1818from src .local_store import read_meetings , write_meetings
1919
2020from .models .meeting import Meeting
21+ from datetime import datetime
22+ import dateparser
2123
# Granicus publisher page that get_tgov_meetings() fetches; also the base
# against which relative video links are resolved with urljoin().
BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
# Presumably the storage bucket for meeting assets -- not referenced in the
# visible part of this module; confirm against callers.
TGOV_BUCKET_NAME = "tgov-meetings"
# Presumably the path of the local JSONL meeting registry -- not referenced in
# the visible part of this module; confirm against callers.
MEETINGS_REGISTRY_PATH = "data/meetings.jsonl"
2527
28+
2629async def fetch_page (url : str , session : aiohttp .ClientSession ) -> str :
2730 """
2831 Fetch the HTML content of a page.
@@ -39,19 +42,8 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
3942 raise Exception (f"Failed to fetch { url } , status code: { response .status } " )
4043 return await response .text ()
4144
42- def clean_date (date : str ) -> str :
43- return re .sub (r"\s+" , " " , date ).strip ()
44-
45- async def parse_meetings (html : str ) -> List [Dict [str , str ]]:
46- """
47- Parse the meeting data from the HTML content.
4845
49- Args:
50- html: The HTML content of the page
51-
52- Returns:
53- A list of dictionaries containing meeting data
54- """
46+ async def parse_meetings (html : str ) -> List [Meeting ]:
5547 parser = HTMLParser (html )
5648
5749 # Find all tables with meeting data
@@ -67,14 +59,19 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
6759 name_cells = row .css ('td.listItem[headers^="Name"]' )
6860 meeting_name = name_cells [0 ].text ().strip () if name_cells else "Unknown"
6961
70- date_cells = row .css ('td.listItem[headers^="Date"]' )
71- raw_date = clean_date (date_cells [0 ].text ().strip ()) if date_cells else "Unknown"
72- meeting_date = raw_date .split ("-" )[0 ].strip () if "-" in raw_date else raw_date
62+ date_cell = row .css_first ('td.listItem[headers^="Date"]' )
63+ meeting_date = dateparser .parse (date_cell .text ())
7364
7465 duration_cells = row .css ('td.listItem[headers^="Duration"]' )
75- duration_str = duration_cells [0 ].text ().strip () if duration_cells else "Unknown"
66+ duration_str = (
67+ duration_cells [0 ].text ().strip () if duration_cells else "Unknown"
68+ )
7669 minutes = duration_to_minutes (duration_str )
77- meeting_duration = f"{ minutes // 60 } :{ minutes % 60 :02d} " if minutes is not None else "Unknown"
70+ meeting_duration = (
71+ f"{ minutes // 60 } :{ minutes % 60 :02d} "
72+ if minutes is not None
73+ else "Unknown"
74+ )
7875
7976 meeting_data = {
8077 "meeting" : meeting_name ,
@@ -100,7 +97,9 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
10097 video_link = video_cell .css_first ("a" )
10198
10299 onclick = video_link .attributes .get ("onclick" , "" )
103- onclick_match = re .search (r"window\.open\(['\"](//[^'\"]+)['\"]" , onclick )
100+ onclick_match = re .search (
101+ r"window\.open\(['\"](//[^'\"]+)['\"]" , onclick
102+ )
104103 clip_id_exp = r"clip_id=(\d+)"
105104
106105 if onclick_match :
@@ -117,14 +116,17 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
117116 if clip_id_match :
118117 clip_id = clip_id_match .group (1 )
119118 meeting_data ["clip_id" ] = clip_id
120- meeting_data ["video" ] = f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={ clip_id } "
119+ meeting_data ["video" ] = (
120+ f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={ clip_id } "
121+ )
121122 else :
122123 meeting_data ["video" ] = urljoin (BASE_URL , href )
123124
124- meetings .append (meeting_data )
125+ meetings .append (Meeting ( ** meeting_data ) )
125126
126127 return meetings
127128
129+
async def get_tgov_meetings() -> Sequence[Meeting]:
    """
    Fetch and parse meeting data from the Government Access Television website.

    Opens a temporary aiohttp session, downloads the page at ``BASE_URL`` with
    ``fetch_page`` and hands the HTML to ``parse_meetings``.

    Returns:
        The meetings produced by ``parse_meetings`` for the downloaded page.

    Raises:
        Exception: Propagated unchanged from ``fetch_page`` when the page
            cannot be fetched.
    """
    async with aiohttp.ClientSession() as session:
        html = await fetch_page(BASE_URL, session)
        # parse_meetings already builds Meeting objects, so no per-row
        # conversion is needed here.
        return await parse_meetings(html)
142141
142+
143143def duration_to_minutes (duration ):
144144 if not duration or pd .isna (duration ):
145145 return None
@@ -149,40 +149,42 @@ def duration_to_minutes(duration):
149149 hours = 0
150150 minutes = 0
151151
152- if 'h' in duration :
153- hours_part = duration .split ('h' )[0 ].strip ()
152+ if "h" in duration :
153+ hours_part = duration .split ("h" )[0 ].strip ()
154154 hours = int (hours_part )
155155
156- if 'm' in duration :
157- if 'h' in duration :
158- minutes_part = duration .split ('h' )[1 ].split ('m' )[0 ].strip ()
156+ if "m" in duration :
157+ if "h" in duration :
158+ minutes_part = duration .split ("h" )[1 ].split ("m" )[0 ].strip ()
159159 else :
160- minutes_part = duration .split ('m' )[0 ].strip ()
160+ minutes_part = duration .split ("m" )[0 ].strip ()
161161 minutes = int (minutes_part )
162162
163163 return hours * 60 + minutes
164164 except :
165165 return None
166166
167+
def get_registry_meetings() -> Sequence[Meeting]:
    """
    Load the meeting registry from whichever backend is active.

    Returns:
        ``list(Meeting.scan())`` when ``is_aws_configured()`` is true,
        otherwise the result of ``read_meetings()`` from the local store.
    """
    if is_aws_configured():
        print("Getting registry from DynamoDB.")
        return list(Meeting.scan())

    print("Getting registry from local store")
    return read_meetings()
174175
176+
def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
    """
    Persist meetings to the active registry backend.

    When ``is_aws_configured()`` is true, each meeting with a truthy
    ``clip_id`` is saved individually and meetings without one are skipped
    with a printed notice. Otherwise the whole sequence is passed to
    ``write_meetings`` without any ``clip_id`` filtering.

    Args:
        meetings: The meetings to store.

    Returns:
        The same ``meetings`` sequence that was passed in, including any
        entries that were skipped.
    """
    if is_aws_configured():
        print("Writing registry to DynamoDB.")
        # NOTE(review): if Meeting is a PynamoDB model, the batch context
        # manager is `Model.batch_write()` and items are queued through the
        # yielded batch object; `batch_writer()` and the per-item
        # `meeting.save()` below should be confirmed against the Meeting model.
        with Meeting.batch_writer():
            for meeting in meetings:
                if meeting.clip_id:
                    meeting.save()
                else:
                    print(f"Skipping meeting with missing clip_id: {meeting}")
    else:
        print("Writing registry to local store")
        write_meetings(meetings)

    return meetings
0 commit comments