Skip to content

Commit e549230

Browse files
authored
Merge pull request #23 from codefortulsa/meeting-datetime
Change Meeting date to datetime
2 parents d60c807 + 5165935 commit e549230

File tree

4 files changed

+62
-51
lines changed

4 files changed

+62
-51
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ faster-whisper = "^1.1.1"
2929
prefect = "^3.3.0"
3030
boto3 = "^1.37.24"
3131
dyntastic = "^0.18.0"
32+
dateparser = "^1.2.1"
3233

3334

3435
[tool.poetry.group.dev.dependencies]

src/meetings.py

Lines changed: 37 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@
1818
from src.local_store import read_meetings, write_meetings
1919

2020
from .models.meeting import Meeting
21+
from datetime import datetime
22+
import dateparser
2123

2224
BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
2325
TGOV_BUCKET_NAME = "tgov-meetings"
2426
MEETINGS_REGISTRY_PATH = "data/meetings.jsonl"
2527

28+
2629
async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
2730
"""
2831
Fetch the HTML content of a page.
@@ -39,19 +42,8 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
3942
raise Exception(f"Failed to fetch {url}, status code: {response.status}")
4043
return await response.text()
4144

42-
def clean_date(date: str) -> str:
43-
return re.sub(r"\s+", " ", date).strip()
44-
45-
async def parse_meetings(html: str) -> List[Dict[str, str]]:
46-
"""
47-
Parse the meeting data from the HTML content.
4845

49-
Args:
50-
html: The HTML content of the page
51-
52-
Returns:
53-
A list of dictionaries containing meeting data
54-
"""
46+
async def parse_meetings(html: str) -> List[Meeting]:
5547
parser = HTMLParser(html)
5648

5749
# Find all tables with meeting data
@@ -67,14 +59,19 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
6759
name_cells = row.css('td.listItem[headers^="Name"]')
6860
meeting_name = name_cells[0].text().strip() if name_cells else "Unknown"
6961

70-
date_cells = row.css('td.listItem[headers^="Date"]')
71-
raw_date = clean_date(date_cells[0].text().strip()) if date_cells else "Unknown"
72-
meeting_date = raw_date.split("-")[0].strip() if "-" in raw_date else raw_date
62+
date_cell = row.css_first('td.listItem[headers^="Date"]')
63+
meeting_date = dateparser.parse(date_cell.text())
7364

7465
duration_cells = row.css('td.listItem[headers^="Duration"]')
75-
duration_str = duration_cells[0].text().strip() if duration_cells else "Unknown"
66+
duration_str = (
67+
duration_cells[0].text().strip() if duration_cells else "Unknown"
68+
)
7669
minutes = duration_to_minutes(duration_str)
77-
meeting_duration = f"{minutes // 60}:{minutes % 60:02d}" if minutes is not None else "Unknown"
70+
meeting_duration = (
71+
f"{minutes // 60}:{minutes % 60:02d}"
72+
if minutes is not None
73+
else "Unknown"
74+
)
7875

7976
meeting_data = {
8077
"meeting": meeting_name,
@@ -100,7 +97,9 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
10097
video_link = video_cell.css_first("a")
10198

10299
onclick = video_link.attributes.get("onclick", "")
103-
onclick_match = re.search(r"window\.open\(['\"](//[^'\"]+)['\"]", onclick)
100+
onclick_match = re.search(
101+
r"window\.open\(['\"](//[^'\"]+)['\"]", onclick
102+
)
104103
clip_id_exp = r"clip_id=(\d+)"
105104

106105
if onclick_match:
@@ -117,14 +116,17 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
117116
if clip_id_match:
118117
clip_id = clip_id_match.group(1)
119118
meeting_data["clip_id"] = clip_id
120-
meeting_data["video"] = f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={clip_id}"
119+
meeting_data["video"] = (
120+
f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={clip_id}"
121+
)
121122
else:
122123
meeting_data["video"] = urljoin(BASE_URL, href)
123124

124-
meetings.append(meeting_data)
125+
meetings.append(Meeting(**meeting_data))
125126

126127
return meetings
127128

129+
128130
async def get_tgov_meetings() -> Sequence[Meeting]:
129131
"""
130132
Fetch and parse meeting data from the Government Access Television website.
@@ -134,12 +136,10 @@ async def get_tgov_meetings() -> Sequence[Meeting]:
134136
"""
135137
async with aiohttp.ClientSession() as session:
136138
html = await fetch_page(BASE_URL, session)
137-
meeting_dicts = await parse_meetings(html)
138-
139-
# Convert dictionaries to Meeting objects
140-
meetings = [Meeting(**meeting_dict) for meeting_dict in meeting_dicts]
139+
meetings = await parse_meetings(html)
141140
return meetings
142141

142+
143143
def duration_to_minutes(duration):
144144
if not duration or pd.isna(duration):
145145
return None
@@ -149,40 +149,42 @@ def duration_to_minutes(duration):
149149
hours = 0
150150
minutes = 0
151151

152-
if 'h' in duration:
153-
hours_part = duration.split('h')[0].strip()
152+
if "h" in duration:
153+
hours_part = duration.split("h")[0].strip()
154154
hours = int(hours_part)
155155

156-
if 'm' in duration:
157-
if 'h' in duration:
158-
minutes_part = duration.split('h')[1].split('m')[0].strip()
156+
if "m" in duration:
157+
if "h" in duration:
158+
minutes_part = duration.split("h")[1].split("m")[0].strip()
159159
else:
160-
minutes_part = duration.split('m')[0].strip()
160+
minutes_part = duration.split("m")[0].strip()
161161
minutes = int(minutes_part)
162162

163163
return hours * 60 + minutes
164164
except:
165165
return None
166166

167+
167168
def get_registry_meetings() -> Sequence[Meeting]:
168169
if is_aws_configured():
169-
print(f'Getting registry from DynamoDB.')
170+
print(f"Getting registry from DynamoDB.")
170171
return list(Meeting.scan())
171172
else:
172-
print(f'Getting registry from local store')
173+
print(f"Getting registry from local store")
173174
return read_meetings()
174175

176+
175177
def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
176178
if is_aws_configured():
177-
print(f'Writing registry to DynamoDB.')
179+
print(f"Writing registry to DynamoDB.")
178180
with Meeting.batch_writer():
179181
for meeting in meetings:
180182
if meeting.clip_id:
181183
meeting.save()
182184
else:
183-
print(f'Skipping meeting with missing clip_id: {meeting}')
185+
print(f"Skipping meeting with missing clip_id: {meeting}")
184186
else:
185-
print(f'Writing registry to local store')
187+
print(f"Writing registry to local store")
186188
write_meetings(meetings)
187189

188190
return meetings

src/models/meeting.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from dyntastic import Dyntastic
88
from pydantic import BaseModel, Field, HttpUrl
9+
from datetime import datetime
910

1011

1112
class Meeting(Dyntastic):
@@ -18,7 +19,7 @@ class Meeting(Dyntastic):
1819

1920
clip_id: Optional[str] = Field(None, description="Granicus clip ID")
2021
meeting: str = Field(description="Name of the meeting")
21-
date: str = Field(description="Date and time of the meeting")
22+
date: datetime = Field(description="Date and time of the meeting")
2223
duration: str = Field(description="Duration of the meeting")
2324
agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
2425
video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")

tests/test_meetings.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
from pathlib import Path
1010
import pytest
1111
from unittest.mock import patch, MagicMock, AsyncMock
12-
12+
from typing import List
1313
import sys
14+
from datetime import datetime
1415

1516
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
1617

@@ -99,33 +100,39 @@ async def test_parse_meetings(sample_html):
99100

100101
assert len(meetings) == 2
101102

102-
assert meetings[0]["meeting"] == "Regular Council Meeting"
103-
assert meetings[0]["date"] == "April 2, 2025"
104-
assert meetings[0]["duration"] == "1:29"
105-
assert "AgendaViewer.php?view_id=4&clip_id=6515" in meetings[0]["agenda"]
106-
assert "MediaPlayer.php?view_id=4&clip_id=6515" in meetings[0]["video"]
103+
mtg_one = meetings[0]
104+
105+
assert mtg_one.meeting == "Regular Council Meeting"
106+
assert mtg_one.date == datetime(2025, 4, 2, 17, 3)
107+
assert mtg_one.duration == "1:29"
108+
assert "AgendaViewer.php?view_id=4&clip_id=6515" in mtg_one.agenda.encoded_string()
109+
assert "MediaPlayer.php?view_id=4&clip_id=6515" in mtg_one.video.encoded_string()
110+
111+
mtg_two = meetings[1]
107112

108-
assert meetings[1]["meeting"] == "Animal Welfare Commission"
109-
assert meetings[1]["date"] == "March 10, 2025"
110-
assert meetings[1]["duration"] == "0:38"
111-
assert meetings[1]["agenda"] is None
112-
assert "MediaPlayer.php?view_id=4&clip_id=6474" in meetings[1]["video"]
113+
assert mtg_two.meeting == "Animal Welfare Commission"
114+
assert mtg_two.date == datetime(2025, 3, 10, 18, 0)
115+
assert mtg_two.duration == "0:38"
116+
assert mtg_two.agenda is None
117+
assert "MediaPlayer.php?view_id=4&clip_id=6474" in mtg_two.video.encoded_string()
113118

114119

115120
@pytest.mark.asyncio
116121
async def test_parse_real_html(real_html):
117122
"""Test that meetings are correctly parsed from real HTML"""
118-
meetings = await parse_meetings(real_html)
123+
meetings: List[Meeting] = await parse_meetings(real_html)
119124

120125
# Basic validation
121126
assert isinstance(meetings, list)
122127
assert len(meetings) > 0
123128

124129
# Check that each meeting has the expected fields
130+
# this is now overkill since pydantic handles this
125131
for meeting in meetings:
126-
assert "meeting" in meeting
127-
assert "date" in meeting
128-
assert "duration" in meeting
132+
assert isinstance(meeting, Meeting)
133+
assert hasattr(meeting, "meeting")
134+
assert hasattr(meeting, "date")
135+
assert hasattr(meeting, "duration")
129136
# Agenda and video may be None for some meetings
130137

131138

0 commit comments

Comments
 (0)