Television websites.
"""

- from typing import Dict, List
+ import re
+ from typing import Dict, List, Sequence
from urllib.parse import urljoin

import aiohttp
import pandas as pd
from selectolax.parser import HTMLParser

+ from src.aws import is_aws_configured
+ from src.models.utils import from_jsonl, to_jsonl
+
from .models.meeting import Meeting

BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
+ TGOV_BUCKET_NAME = "tgov-meetings"
+ MEETINGS_REGISTRY_PATH = "data/meetings.jsonl"


async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
@@ -35,6 +41,10 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
        return await response.text()


+ def clean_date(date: str) -> str:
+     # Strip all whitespace from the scraped date string
+     return re.sub(r"\s+", "", date)
+
+
async def parse_meetings(html: str) -> List[Dict[str, str]]:
    """
    Parse the meeting data from the HTML content.
@@ -69,9 +79,10 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:

        meeting_data = {
            "meeting": cells[0].text().strip(),
-             "date": cells[1].text().strip(),
+             "date": clean_date(cells[1].text().strip()),
            "duration": cells[2].text().strip(),
            "agenda": None,
+             "clip_id": None,
            "video": None,
        }

@@ -86,37 +97,22 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
        # Extract video link if available
        video_cell = cells[4]
        video_link = video_cell.css_first("a")
-         if video_link:
-             # First try to extract from onclick attribute
-             onclick = video_link.attributes.get("onclick", "")
-             if onclick:
-                 # Look for window.open pattern
-                 if "window.open(" in onclick:
-                     # Extract URL from window.open('URL', ...)
-                     start_quote = onclick.find("'", onclick.find("window.open("))
-                     end_quote = onclick.find("'", start_quote + 1)
-                     if start_quote > 0 and end_quote > start_quote:
-                         video_url = onclick[start_quote + 1 : end_quote]
-                         # Handle protocol-relative URLs (starting with //)
-                         if video_url.startswith("//"):
-                             video_url = f"https:{video_url}"
-                         meeting_data["video"] = video_url
-
-             # If onclick extraction failed, try href
-             if meeting_data["video"] is None and video_link.attributes.get("href"):
-                 href = video_link.attributes.get("href")
-                 # Handle javascript: hrefs
-                 if href.startswith("javascript:"):
-                     # Try to extract clip_id from the onclick attribute again
-                     # This handles cases where href is javascript:void(0) but onclick has the real URL
-                     if meeting_data["video"] is None and "clip_id=" in onclick:
-                         start_idx = onclick.find("clip_id=")
-                         end_idx = onclick.find("'", start_idx)
-                         if start_idx > 0 and end_idx > start_idx:
-                             clip_id = onclick[start_idx + 8 : end_idx]
-                             meeting_data["video"] = (
-                                 f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={clip_id}"
-                             )
+         # Prefer the URL embedded in the link's window.open(...) onclick handler
+         onclick = video_link.attributes.get("onclick", "") if video_link else ""
+         onclick_match = re.search(r"window\.open\(['\"](//[^'\"]+)['\"]", onclick)
+         clip_id_exp = r"clip_id=(\d+)"
+
+         if onclick_match:
+             meeting_data["video"] = f"https:{onclick_match.group(1)}"
+             clip_id_match = re.search(clip_id_exp, onclick)
+             if clip_id_match:
+                 meeting_data["clip_id"] = clip_id_match.group(1)
+
+         # Fall back to the href when onclick did not yield a URL
+         if video_link and not meeting_data["video"]:
+             href = video_link.attributes.get("href", "")
+             if href.startswith("javascript:"):
+                 clip_id_match = re.search(clip_id_exp, href)
+                 if clip_id_match:
+                     clip_id = clip_id_match.group(1)
+                     meeting_data["clip_id"] = clip_id
+                     meeting_data["video"] = (
+                         f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={clip_id}"
+                     )
            else:
                meeting_data["video"] = urljoin(BASE_URL, href)

@@ -125,7 +121,7 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
    return meetings


- async def get_meetings() -> List[Meeting]:
+ async def get_tgov_meetings() -> Sequence[Meeting]:
    """
    Fetch and parse meeting data from the Government Access Television website.

@@ -164,3 +160,44 @@ def duration_to_minutes(duration):
        return hours * 60 + minutes
    except:
        return None
+
+
+ def get_registry_meetings() -> Sequence[Meeting]:
+     if is_aws_configured():
+         print(f'Getting registry from AWS S3 bucket: {TGOV_BUCKET_NAME}, path: {MEETINGS_REGISTRY_PATH}')
+         import boto3
+         from botocore.exceptions import ClientError
+         s3 = boto3.client('s3')
+         try:
+             registry_response = s3.get_object(Bucket=TGOV_BUCKET_NAME, Key=MEETINGS_REGISTRY_PATH)
+             registry_body = registry_response['Body'].read().decode('utf-8')
+             return from_jsonl(registry_body, Meeting)
+         except ClientError as e:
+             if e.response['Error']['Code'] == 'NoSuchKey':
+                 print('No registry file found on S3. Returning empty list.')
+
+     return []
+
+
+ def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
+     jsonl_str = to_jsonl(meetings)
+
+     if is_aws_configured():
+         print(f'Writing registry to AWS S3 bucket: {TGOV_BUCKET_NAME}, path: {MEETINGS_REGISTRY_PATH}')
+         import boto3
+         from botocore.exceptions import ClientError
+         s3 = boto3.client('s3')
+
+         try:
+             s3.put_object(
+                 Bucket=TGOV_BUCKET_NAME,
+                 Key=MEETINGS_REGISTRY_PATH,
+                 Body=jsonl_str,
+                 ContentType='application/x-ndjson'
+             )
+             print(f'Wrote {len(meetings)} meetings to S3.')
+         except ClientError as e:
+             print(f"Failed to write to S3: {e}")
+             raise
+
+     return meetings
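
For context, a minimal sketch of how the new helpers might be wired together to refresh the registry. This is not part of the diff above: the module path src.tgov, the clip_id attribute on Meeting, and the merge-by-clip_id step are illustrative assumptions.

# Hypothetical driver -- assumes the module above is importable as src.tgov and
# that Meeting exposes a unique clip_id field (both assumptions, not from this PR).
import asyncio

from src.tgov import get_registry_meetings, get_tgov_meetings, write_registry_meetings


async def refresh_registry():
    scraped = await get_tgov_meetings()        # scrape the current Granicus listing
    registry = list(get_registry_meetings())  # meetings already recorded in S3 (or [])
    known_ids = {m.clip_id for m in registry}
    new = [m for m in scraped if m.clip_id not in known_ids]
    if new:
        write_registry_meetings(registry + new)  # persist the merged registry
    return new


if __name__ == "__main__":
    asyncio.run(refresh_registry())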