use lists of urls for transcripts and subtitles

jdungan · jdungan · commit cfff8989986d · 2025-04-10T10:05:57.000-05:00
diff --git a/db/queries.py b/db/queries.py
@@ -14,4 +14,4 @@ def get_meetings(days: int = 7) -> Sequence[Meeting]:
         A.date >= target_date,
     )
 
-    return Sequence(meetings)
+    return list(meetings)
diff --git a/flows/add_subtitles.py b/flows/add_subtitles.py
@@ -1,33 +1,49 @@
 from prefect import flow
+from dyntastic import A
+from pydantic import HttpUrl
+
+from src.models.subtitles import Transcript
 
 from tasks.subtitles import create_vtt_track
 from db.queries import get_meetings
-
-from src.models.subtitles import SubtitleTrack, TrackFormat
 from src.aws import save_content_to_s3
+from src.http_utils import async_get_json
 
 
 @flow(log_prints=True)
 async def add_subtitles():
     meetings = get_meetings(days=90)
-    meetings_with_transcript = [
+    meetings_with_transcripts = [
         meeting
         for meeting in meetings
-        if hasattr(meeting, "transcript") and meeting.transcript
+        if hasattr(meeting, "transcripts") and meeting.transcripts is not None or []
     ]
-    for meeting in meetings_with_transcript:
-        if not meeting.subtitles:
+    for meeting in meetings_with_transcripts:
+        for transcript_url in meeting.transcripts:
+            transcript_data = await async_get_json(transcript_url.encoded_string())
+            transcript = Transcript.model_validate(transcript_data)
+            language = transcript.language
+            if f"{language}.vtt" in meeting.subtitles:
+                continue
             track_content = await create_vtt_track(
-                meeting.transcript,
+                transcript,
                 include_speaker_prefix=False,
             )
-            save_content_to_s3(
+            result: HttpUrl = save_content_to_s3(
                 track_content,
                 "tgov-assets",
-                f"{meeting.filename()}.subtitles.vtt",
+                f"{meeting.filename()}/subtitles/{language}.vtt",
                 "text/vtt",
             )
-    return track_content
+            if not meeting.subtitles:
+                meeting.subtitles = [result]
+            else:
+                (
+                    meeting.subtitles.append(result)
+                    if result not in meeting.subtitles
+                    else None
+                )
+            meeting.save()
 
 
 if __name__ == "__main__":
diff --git a/notebooks/srt_subtitles.ipynb b/notebooks/srt_subtitles.ipynb
@@ -28,7 +28,7 @@
     "sys.path.append(\"../\")\n",
     "\n",
     "# Import the necessary modules from the new subtitles package\n",
-    "from src.subtitles import create_track"
+    "from src.subtitles import create_subtitles"
    ]
   },
   {
@@ -51,9 +51,9 @@
     "    \"../data/transcripts/regular_council_meeting___2025_02_26.diarized.json\"\n",
     ")\n",
     "# Create an SRT track from the transcript\n",
-    "srt_track = create_track(\n",
+    "srt_track = create_subtitles(\n",
     "    transcript_data=transcript_file,\n",
-    "    format='srt',\n",
+    "    format=\"srt\",\n",
     "    max_duration=5.0,  # Maximum duration for each subtitle\n",
     "    max_length=80,  # Maximum length in characters\n",
     "    include_speaker_prefix=True,  # Include speaker labels\n",
@@ -120,7 +120,7 @@
    "outputs": [],
    "source": [
     "# Import the subtitles module\n",
-    "from src.subtitles import create_track\n",
+    "from src.subtitles import create_subtitles\n",
     "import subprocess\n",
     "\n",
     "# Path to the SRT file we saved earlier\n",
@@ -155,8 +155,7 @@
     "    cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE\n",
     ")\n",
     "print(\"Successfully created video with embedded subtitles!\")\n",
-    "print(f\"Output video saved to: {output_video}\")\n",
-    "\n"
+    "print(f\"Output video saved to: {output_video}\")"
    ]
   },
   {
diff --git a/src/aws.py b/src/aws.py
@@ -2,6 +2,7 @@
 
 import boto3
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
+from pydantic import HttpUrl
 
 
 def is_aws_configured():
@@ -41,9 +42,12 @@ def upload_to_s3(file_path, bucket_name, s3_path):
 
 
 def save_content_to_s3(content, bucket_name, s3_key, content_type):
-    return s3_client.put_object(
+    response = s3_client.put_object(
         Bucket=bucket_name,
         Key=s3_key,
         Body=content.encode("utf-8"),
         ContentType=content_type,
     )
+    region = s3_client.meta.region_name
+    url = f"https://{bucket_name}.s3.{region}.amazonaws.com/{s3_key}"
+    return HttpUrl(url)
diff --git a/src/models/meeting.py b/src/models/meeting.py
@@ -7,7 +7,7 @@
 from dyntastic import Dyntastic
 from pydantic import BaseModel, Field, HttpUrl
 from datetime import datetime
-from typing import Sequence
+from typing import Sequence, List
 
 
 def clean_filename(meeting_name: str) -> str:
@@ -24,10 +24,10 @@ class Meeting(Dyntastic):
     duration: str = Field(description="Duration of the meeting")
     agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
     video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")
-    transcript: Optional[HttpUrl] = Field(
-        None, description="URL to the meeting transcript"
+    transcripts: Optional[List[HttpUrl]] = Field(
+        None, description="URLs to the meeting transcripts"
     )
-    subtitles: Optional[HttpUrl] = Field(
+    subtitles: Optional[List[HttpUrl]] = Field(
         None, description="URLs to the meeting subtitle tracks"
     )
 
@@ -36,7 +36,7 @@ def __str__(self) -> str:
         return f"{self.meeting} ({self.date})"
 
     def filename(self) -> str:
-        return f"{self.clip_id}/{clean_filename(self.meeting)} ({self.date})"
+        return f"{self.clip_id}/{clean_filename(self.meeting)}/({self.date})"
 
 
 class GranicusPlayerPage(BaseModel):
diff --git a/src/subtitles.py b/src/subtitles.py
@@ -189,7 +189,6 @@ def chunk_transcript(
     """
     chunks = []
     current_chunk = {"text": "", "start": 0, "end": 0, "speaker": "", "words": []}
-
     for segment in transcript.segments:
         # Skip very short segments
         if segment.end - segment.start < 0.1:
@@ -455,7 +454,6 @@ def create_subtitles(
     """
     # Normalize track format to TrackFormat enum internally
     track_format = TrackFormat(format.lower())
-
     # Chunk the transcript
     chunks = chunk_transcript(
         transcript,
diff --git a/tasks/subtitles.py b/tasks/subtitles.py
@@ -2,16 +2,15 @@
 from src.subtitles import create_subtitles
 from src.models.subtitles import SubtitleTrack, TrackFormat, Transcript
 from pydantic import HttpUrl
-from src.http_utils import async_get_json
+
 from src.models.subtitles import TrackFormat
 
 
 @task
 async def create_vtt_track(
-    transcript_url: HttpUrl, include_speaker_prefix: bool = False
+    transcript: Transcript, include_speaker_prefix: bool = False
 ) -> str:
-    transcript_data = await async_get_json(transcript_url.encoded_string())
-    transcript = Transcript.model_validate(transcript_data)
+
     vtt_track: SubtitleTrack = create_subtitles(
         transcript,
         format=TrackFormat.VTT,
diff --git a/tests/test_subtitles.py b/tests/test_subtitles.py
@@ -5,11 +5,8 @@
 import pytest
 from pathlib import Path
 from src.subtitles import (
-    create_track,
-    format_time_for_srt,
+    create_subtitles,
     format_time_for_vtt,
-    format_time_for_ass,
-    load_transcript,
     chunk_transcript,
 )
 from src.models.subtitles import (
@@ -23,6 +20,14 @@
 )
 
 
+def load_transcript(fixture_transcript_path) -> Transcript:
+    """Load the transcript fixture."""
+    with open(fixture_transcript_path, "r") as f:
+        transcript_data = f.read()
+    transcript = Transcript.model_validate_json(transcript_data)
+    return transcript
+
+
 @pytest.fixture
 def fixture_transcript_path() -> Path:
     """Fixture for the path to our mock transcript fixture."""
@@ -31,65 +36,13 @@ def fixture_transcript_path() -> Path:
 
 @pytest.fixture
 def fixture_transcript(fixture_transcript_path) -> Transcript:
-    """Load the transcript fixture."""
     return load_transcript(fixture_transcript_path)
 
 
-def test_format_time_for_srt():
-    """Test SRT timestamp formatting."""
-    assert format_time_for_srt(0) == "00:00:00,000"
-    assert format_time_for_srt(3661.5) == "01:01:01,500"
-    assert format_time_for_srt(123.456) == "00:02:03,456"
-
-
-def test_format_time_for_vtt():
-    """Test VTT timestamp formatting."""
-    assert format_time_for_vtt(0) == "00:00:00.000"
-    assert format_time_for_vtt(3661.5) == "01:01:01.500"
-    assert format_time_for_vtt(123.456) == "00:02:03.456"
-
-
-def test_format_time_for_ass():
-    """Test ASS timestamp formatting."""
-    assert format_time_for_ass(0) == "0:00:00.00"
-    assert format_time_for_ass(3661.5) == "1:01:01.50"
-    assert format_time_for_ass(123.456) == "0:02:03.45"
-
-
-def test_create_srt_track(fixture_transcript_path):
-    """Test creating an SRT track from a transcript."""
-    srt_track = create_track(
-        fixture_transcript_path,
-        format="srt",
-        max_duration=5.0,
-        max_words=14,
-        include_speaker_prefix=True,
-    )
-
-    # Verify track metadata
-    assert srt_track.metadata.format == TrackFormat.SRT
-    assert srt_track.metadata.language == "en"
-
-    # Verify that entries were created
-    assert len(srt_track.entries) > 0
-    assert all(isinstance(entry, SrtEntry) for entry in srt_track.entries)
-
-    # Check that speaker prefixes were added
-    assert any("[Speaker" in entry.text for entry in srt_track.entries)
-
-    # Test content generation using unified method
-    srt_content = srt_track.content()
-    assert srt_content.startswith("1\n")
-    assert "-->" in srt_content
-
-    # Verify old method still works for backward compatibility
-    assert srt_track.to_srt_content() == srt_content
-
-
-def test_create_vtt_track(fixture_transcript_path):
+def test_create_vtt_track(fixture_transcript):
     """Test creating a VTT track from a transcript."""
-    vtt_track = create_track(
-        fixture_transcript_path,
+    vtt_track = create_subtitles(
+        fixture_transcript,
         format="vtt",
         max_duration=4.0,
         max_words=12,
@@ -108,53 +61,8 @@ def test_create_vtt_track(fixture_transcript_path):
     assert vtt_content.startswith("WEBVTT")
     assert "-->" in vtt_content
 
-    # Verify old method still works for backward compatibility
-    assert vtt_track.to_vtt_content() == vtt_content
-
-
-def test_create_ass_track(fixture_transcript_path):
-    """Test creating an ASS track from a transcript."""
-    ass_track = create_track(
-        fixture_transcript_path,
-        format="ass",
-        font_size=28,
-        bg_opacity=0.3,
-    )
-
-    # Verify track metadata
-    assert ass_track.metadata.format == TrackFormat.ASS
-    assert ass_track.metadata.language == "en"
-    assert ass_track.metadata.style.font_size == 28
-
-    # Verify that entries were created
-    assert len(ass_track.entries) > 0
-    assert all(isinstance(entry, AssEntry) for entry in ass_track.entries)
-
-    # Test content generation using unified method
-    ass_content = ass_track.content()
-    assert "[Script Info]" in ass_content
-    assert "Dialogue:" in ass_content
-
-    # Verify old method still works for backward compatibility
-    assert ass_track.to_ass_content() == ass_content
-
-
-def test_load_transcript(fixture_transcript_path):
-    """Test that a transcript can be loaded from a JSON file."""
-    transcript = load_transcript(fixture_transcript_path)
-    assert transcript.language == "en"
-    assert len(transcript.segments) > 0
-
-    # Test at least one speaker from our fixture
-    speakers = set(
-        segment.speaker for segment in transcript.segments if segment.speaker
-    )
-    assert "SPEAKER_01" in speakers
-    assert "SPEAKER_02" in speakers
-
 
 def test_chunk_transcript(fixture_transcript):
-    """Test the transcript chunking functionality."""
     chunks = chunk_transcript(
         fixture_transcript,
         max_duration=5.0,

Original file line number	Diff line number	Diff line change
`@@ -14,4 +14,4 @@ def get_meetings(days: int = 7) -> Sequence[Meeting]:`
`14`	`14`	`A.date >= target_date,`
`15`	`15`	`)`
`16`	`16`
`17`		`- return Sequence(meetings)`
	`17`	`+ return list(meetings)`