Skip to content

Commit df456e5

Browse files
committed
change Meeting date to datetime
1 parent 8b6527f commit df456e5

File tree

3 files changed

+40
-36
lines changed

3 files changed

+40
-36
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ faster-whisper = "^1.1.1"
2828
prefect = "^3.3.0"
2929
boto3 = "^1.37.24"
3030
dyntastic = "^0.18.0"
31+
dateparser = "^1.2.1"
3132

3233

3334
[tool.poetry.group.dev.dependencies]

src/meetings.py

Lines changed: 37 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@
1818
from src.local_store import read_meetings, write_meetings
1919

2020
from .models.meeting import Meeting
21+
from datetime import datetime
22+
import dateparser
2123

2224
BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
2325
TGOV_BUCKET_NAME = "tgov-meetings"
2426
MEETINGS_REGISTRY_PATH = "data/meetings.jsonl"
2527

28+
2629
async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
2730
"""
2831
Fetch the HTML content of a page.
@@ -39,19 +42,8 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
3942
raise Exception(f"Failed to fetch {url}, status code: {response.status}")
4043
return await response.text()
4144

42-
def clean_date(date: str) -> str:
43-
return re.sub(r"\s+", " ", date).strip()
44-
45-
async def parse_meetings(html: str) -> List[Dict[str, str]]:
46-
"""
47-
Parse the meeting data from the HTML content.
4845

49-
Args:
50-
html: The HTML content of the page
51-
52-
Returns:
53-
A list of dictionaries containing meeting data
54-
"""
46+
async def parse_meetings(html: str) -> List[Meeting]:
5547
parser = HTMLParser(html)
5648

5749
# Find all tables with meeting data
@@ -67,14 +59,19 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
6759
name_cells = row.css('td.listItem[headers^="Name"]')
6860
meeting_name = name_cells[0].text().strip() if name_cells else "Unknown"
6961

70-
date_cells = row.css('td.listItem[headers^="Date"]')
71-
raw_date = clean_date(date_cells[0].text().strip()) if date_cells else "Unknown"
72-
meeting_date = raw_date.split("-")[0].strip() if "-" in raw_date else raw_date
62+
date_cell = row.css_first('td.listItem[headers^="Date"]')
63+
meeting_date = dateparser.parse(date_cell.text())
7364

7465
duration_cells = row.css('td.listItem[headers^="Duration"]')
75-
duration_str = duration_cells[0].text().strip() if duration_cells else "Unknown"
66+
duration_str = (
67+
duration_cells[0].text().strip() if duration_cells else "Unknown"
68+
)
7669
minutes = duration_to_minutes(duration_str)
77-
meeting_duration = f"{minutes // 60}:{minutes % 60:02d}" if minutes is not None else "Unknown"
70+
meeting_duration = (
71+
f"{minutes // 60}:{minutes % 60:02d}"
72+
if minutes is not None
73+
else "Unknown"
74+
)
7875

7976
meeting_data = {
8077
"meeting": meeting_name,
@@ -100,7 +97,9 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
10097
video_link = video_cell.css_first("a")
10198

10299
onclick = video_link.attributes.get("onclick", "")
103-
onclick_match = re.search(r"window\.open\(['\"](//[^'\"]+)['\"]", onclick)
100+
onclick_match = re.search(
101+
r"window\.open\(['\"](//[^'\"]+)['\"]", onclick
102+
)
104103
clip_id_exp = r"clip_id=(\d+)"
105104

106105
if onclick_match:
@@ -117,14 +116,17 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
117116
if clip_id_match:
118117
clip_id = clip_id_match.group(1)
119118
meeting_data["clip_id"] = clip_id
120-
meeting_data["video"] = f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={clip_id}"
119+
meeting_data["video"] = (
120+
f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={clip_id}"
121+
)
121122
else:
122123
meeting_data["video"] = urljoin(BASE_URL, href)
123124

124-
meetings.append(meeting_data)
125+
meetings.append(Meeting(**meeting_data))
125126

126127
return meetings
127128

129+
128130
async def get_tgov_meetings() -> Sequence[Meeting]:
129131
"""
130132
Fetch and parse meeting data from the Government Access Television website.
@@ -134,12 +136,10 @@ async def get_tgov_meetings() -> Sequence[Meeting]:
134136
"""
135137
async with aiohttp.ClientSession() as session:
136138
html = await fetch_page(BASE_URL, session)
137-
meeting_dicts = await parse_meetings(html)
138-
139-
# Convert dictionaries to Meeting objects
140-
meetings = [Meeting(**meeting_dict) for meeting_dict in meeting_dicts]
139+
meetings = await parse_meetings(html)
141140
return meetings
142141

142+
143143
def duration_to_minutes(duration):
144144
if not duration or pd.isna(duration):
145145
return None
@@ -149,40 +149,42 @@ def duration_to_minutes(duration):
149149
hours = 0
150150
minutes = 0
151151

152-
if 'h' in duration:
153-
hours_part = duration.split('h')[0].strip()
152+
if "h" in duration:
153+
hours_part = duration.split("h")[0].strip()
154154
hours = int(hours_part)
155155

156-
if 'm' in duration:
157-
if 'h' in duration:
158-
minutes_part = duration.split('h')[1].split('m')[0].strip()
156+
if "m" in duration:
157+
if "h" in duration:
158+
minutes_part = duration.split("h")[1].split("m")[0].strip()
159159
else:
160-
minutes_part = duration.split('m')[0].strip()
160+
minutes_part = duration.split("m")[0].strip()
161161
minutes = int(minutes_part)
162162

163163
return hours * 60 + minutes
164164
except:
165165
return None
166166

167+
167168
def get_registry_meetings() -> Sequence[Meeting]:
168169
if is_aws_configured():
169-
print(f'Getting registry from DynamoDB.')
170+
print(f"Getting registry from DynamoDB.")
170171
return list(Meeting.scan())
171172
else:
172-
print(f'Getting registry from local store')
173+
print(f"Getting registry from local store")
173174
return read_meetings()
174175

176+
175177
def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
176178
if is_aws_configured():
177-
print(f'Writing registry to DynamoDB.')
179+
print(f"Writing registry to DynamoDB.")
178180
with Meeting.batch_writer():
179181
for meeting in meetings:
180182
if meeting.clip_id:
181183
meeting.save()
182184
else:
183-
print(f'Skipping meeting with missing clip_id: {meeting}')
185+
print(f"Skipping meeting with missing clip_id: {meeting}")
184186
else:
185-
print(f'Writing registry to local store')
187+
print(f"Writing registry to local store")
186188
write_meetings(meetings)
187189

188190
return meetings

src/models/meeting.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from dyntastic import Dyntastic
88
from pydantic import BaseModel, Field, HttpUrl
9+
from datetime import datetime
910

1011

1112
class Meeting(Dyntastic):
@@ -18,7 +19,7 @@ class Meeting(Dyntastic):
1819

1920
clip_id: Optional[str] = Field(None, description="Granicus clip ID")
2021
meeting: str = Field(description="Name of the meeting")
21-
date: str = Field(description="Date and time of the meeting")
22+
date: datetime = Field(description="Date and time of the meeting")
2223
duration: str = Field(description="Duration of the meeting")
2324
agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
2425
video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")

0 commit comments

Comments
 (0)