Skip to content

Commit 092c1f9

Browse files
committed
start diarization flow
1 parent a95c8b8 commit 092c1f9

File tree

14 files changed

+116
-46
lines changed

14 files changed

+116
-46
lines changed

.github/workflows/python-app.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: Build/run tgov
22

33
on:
44
push:
5-
branches: [ "main", "deploy-lambda" ]
5+
branches: [ "main", "deploy-lambda", "test-flows" ]
66
pull_request:
77
branches: [ "main" ]
88

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,4 @@ RUN poetry install
2323
ENV PYTHONPATH=/app
2424
ENV PATH="${POETRY_VENV}/bin:${PATH}"
2525

26-
CMD ["python", "src/run_diarization.py"]
26+
CMD ["python", "-m", "flows.translate_meetings"]

db/queries.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from datetime import datetime, timedelta
2-
from typing import Sequence
2+
from typing import List, Optional
33
from dyntastic import A
44
from src.models.meeting import Meeting
55

66

7-
def get_meetings(days: int = 7) -> Sequence[Meeting]:
7+
def get_meetings(days: int = 7, video: Optional[bool] = None) -> List[Meeting]:
88
"""
99
Get meetings that occurred in the past number of days from now.
1010
"""
@@ -13,5 +13,6 @@ def get_meetings(days: int = 7) -> Sequence[Meeting]:
1313
meetings = Meeting.scan(
1414
A.date >= target_date,
1515
)
16+
meetings_list = [m for m in meetings if (video is None or bool(m.video) == video)]
1617

17-
return list(meetings)
18+
return list(meetings_list)

flows/add_subtitles.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
from prefect import flow
23
from dyntastic import A
34
from pydantic import HttpUrl
@@ -11,30 +12,41 @@
1112

1213

1314
@flow(log_prints=True)
14-
async def add_subtitles():
15+
def add_subtitles():
16+
print("Getting meetings")
1517
meetings = get_meetings(days=90)
1618
meetings_with_transcripts = [
1719
meeting
1820
for meeting in meetings
1921
if hasattr(meeting, "transcripts") and meeting.transcripts is not None or []
2022
]
23+
print(f"Found {len(meetings_with_transcripts)} meetings with transcripts")
2124
for meeting in meetings_with_transcripts:
2225
for transcript_url in meeting.transcripts:
23-
transcript_data = await async_get_json(transcript_url.encoded_string())
26+
print(f"Processing {transcript_url}")
27+
transcript_data = asyncio.run(
28+
async_get_json(transcript_url.encoded_string())
29+
)
30+
print(f"Transcript data: {transcript_data}")
2431
transcript = Transcript.model_validate(transcript_data)
2532
language = transcript.language
2633
if f"{language}.vtt" in meeting.subtitles:
2734
continue
28-
track_content = await create_vtt_track(
29-
transcript,
30-
include_speaker_prefix=False,
35+
print(f"Creating VTT track for {language}")
36+
track_content = asyncio.run(
37+
create_vtt_track(
38+
transcript,
39+
include_speaker_prefix=False,
40+
)
3141
)
42+
print(f"Saving VTT track to S3")
3243
result: HttpUrl = save_content_to_s3(
3344
track_content,
3445
"tgov-assets",
3546
f"{meeting.filename()}/subtitles/{language}.vtt",
3647
"text/vtt",
3748
)
49+
print(f"VTT track saved to S3")
3850
if not meeting.subtitles:
3951
meeting.subtitles = [result]
4052
else:
@@ -43,10 +55,11 @@ async def add_subtitles():
4355
if result not in meeting.subtitles
4456
else None
4557
)
58+
print(f"Saving meeting to DynamoDB")
4659
meeting.save()
60+
print(f"Meeting saved to DynamoDB")
4761

4862

4963
if __name__ == "__main__":
50-
import asyncio
51-
52-
asyncio.run(add_subtitles())
64+
print("Starting add_subtitles")
65+
add_subtitles()

flows/translate_meetings.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
11
from prefect import flow
22

3-
from tasks.meetings import get_new_meetings
3+
from db.queries import get_meetings
4+
from tasks.diarize import diarize_meeting
5+
from tasks.meetings import register_meetings
46

57

68
@flow(log_prints=True)
7-
async def translate_meetings():
8-
new_meetings = await get_new_meetings()
9-
# new_transcribed_meetings = await transcribe_videos(new_meetings)
9+
def translate_meetings():
10+
new_meetings = register_meetings()
11+
print(f"Registered {len(new_meetings)} new meetings")
12+
meetings_to_diarize = get_meetings(video=True)
13+
print(f"Found {len(meetings_to_diarize)} meetings to diarize")
14+
for meeting in meetings_to_diarize:
15+
diarize_meeting(meeting)
1016
# new_subtitled_video_pages = await create_subtitled_video_pages(new_transcribed_meetings)
1117
# new_translated_meetings = await translate_transcriptions(new_transcribed_meetings)
1218

19+
1320
if __name__ == "__main__":
14-
import asyncio
15-
asyncio.run(translate_meetings())
21+
translate_meetings()

src/aws.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77

88
def is_aws_configured():
99
required_vars = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"]
10-
return all(var in os.environ for var in required_vars)
10+
return True
11+
# return all(var in os.environ for var in required_vars)
1112

1213

1314
s3_client = boto3.client("s3")

src/local_store.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,22 @@
11
import json
22
import os
3-
from typing import Sequence
3+
from typing import List
44
from src.models.meeting import Meeting
55

66
LOCAL_STORE_PATH = "data/meetings.json"
77

8-
def read_meetings() -> Sequence[Meeting]:
8+
9+
def read_meetings() -> List[Meeting]:
910
if not os.path.exists(LOCAL_STORE_PATH):
1011
return []
1112

12-
with open(LOCAL_STORE_PATH, 'r') as f:
13+
with open(LOCAL_STORE_PATH, "r") as f:
1314
data = json.load(f)
1415
return [Meeting(**meeting) for meeting in data]
1516

16-
def write_meetings(meetings: Sequence[Meeting]):
17+
18+
def write_meetings(meetings: List[Meeting]):
1719
os.makedirs(os.path.dirname(LOCAL_STORE_PATH), exist_ok=True)
18-
with open(LOCAL_STORE_PATH, 'w') as f:
20+
with open(LOCAL_STORE_PATH, "w") as f:
1921
json_data = [meeting.model_dump_json() for meeting in meetings]
2022
json.dump(json_data, f)

src/meetings.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"""
88

99
import re
10-
from typing import Dict, List, Sequence
10+
from typing import List
1111
from urllib.parse import urljoin
1212

1313
import aiohttp
@@ -127,7 +127,7 @@ async def parse_meetings(html: str) -> List[Meeting]:
127127
return meetings
128128

129129

130-
async def get_tgov_meetings() -> Sequence[Meeting]:
130+
async def get_tgov_meetings() -> List[Meeting]:
131131
"""
132132
Fetch and parse meeting data from the Government Access Television website.
133133
@@ -165,7 +165,7 @@ def duration_to_minutes(duration):
165165
return None
166166

167167

168-
def get_registry_meetings() -> Sequence[Meeting]:
168+
def get_registry_meetings() -> List[Meeting]:
169169
if is_aws_configured():
170170
print(f"Getting registry from DynamoDB.")
171171
return list(Meeting.scan())
@@ -174,7 +174,7 @@ def get_registry_meetings() -> Sequence[Meeting]:
174174
return read_meetings()
175175

176176

177-
def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
177+
def write_registry_meetings(meetings: List[Meeting]) -> List[Meeting]:
178178
if is_aws_configured():
179179
print(f"Writing registry to DynamoDB.")
180180
with Meeting.batch_writer():

src/models/meeting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from dyntastic import Dyntastic
88
from pydantic import BaseModel, Field, HttpUrl
99
from datetime import datetime
10-
from typing import Sequence, List
10+
from typing import List
1111

1212

1313
def clean_filename(meeting_name: str) -> str:

src/models/utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,7 @@ def from_jsonl(jsonl_str: str, model_class: Type[T]) -> Sequence[T]:
3434
Returns:
3535
A list of instances of the specified Pydantic model class.
3636
"""
37-
return [model_class.model_validate(json.loads(line)) for line in jsonl_str.strip().splitlines()]
37+
return [
38+
model_class.model_validate(json.loads(line))
39+
for line in jsonl_str.strip().splitlines()
40+
]

0 commit comments

Comments
 (0)