-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathmeetings.py
More file actions
142 lines (115 loc) · 4.99 KB
/
meetings.py
File metadata and controls
142 lines (115 loc) · 4.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python3
"""
Government Access Television Meeting Scraper
This module provides functions to scrape meeting data from Government Access
Television websites.
"""
import asyncio
import json
from typing import Dict, List, Optional, Any
from urllib.parse import urljoin
import aiohttp
from selectolax.parser import HTMLParser
from .models.meeting import Meeting
BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
    """
    Fetch the HTML content of a page.

    Args:
        url: The URL to fetch
        session: An aiohttp ClientSession

    Returns:
        The HTML content as a string

    Raises:
        aiohttp.ClientResponseError: If the response status is 4xx/5xx.
        Exception: If the response status is any other non-200 code.
    """
    async with session.get(url) as response:
        # Prefer aiohttp's own error type over a bare Exception for HTTP
        # errors: ClientResponseError is a subclass of Exception (so
        # existing callers catching Exception still work) and carries the
        # status, headers, and request info for debugging.
        response.raise_for_status()
        if response.status != 200:
            # Defensive: a non-error, non-200 status (e.g. 204 No Content)
            # still means we did not get the page we asked for.
            raise Exception(f"Failed to fetch {url}, status code: {response.status}")
        return await response.text()
def _extract_agenda_url(cell) -> Optional[str]:
    """Return the absolute agenda URL from a table cell, or None if absent."""
    link = cell.css_first("a")
    if link and link.attributes.get("href"):
        return urljoin(BASE_URL, link.attributes.get("href"))
    return None


def _extract_video_url(cell) -> Optional[str]:
    """Return the absolute video URL from a table cell, or None if absent.

    Tries, in order:
      1. a window.open('URL', ...) call in the link's onclick attribute,
      2. a clip_id=... fragment in the onclick (when href is a
         javascript: pseudo-URL such as javascript:void(0)),
      3. the href attribute itself.
    """
    link = cell.css_first("a")
    if link is None:
        return None

    # `or ""` (not a .get default): selectolax yields None for a
    # valueless attribute, which would crash the `in` checks below.
    onclick = link.attributes.get("onclick") or ""

    # 1. Extract the URL from window.open('URL', ...).
    if "window.open(" in onclick:
        start_quote = onclick.find("'", onclick.find("window.open("))
        end_quote = onclick.find("'", start_quote + 1)
        if start_quote != -1 and end_quote > start_quote:
            video_url = onclick[start_quote + 1 : end_quote]
            # Give protocol-relative URLs (//host/...) an explicit scheme.
            if video_url.startswith("//"):
                video_url = f"https:{video_url}"
            return video_url

    href = link.attributes.get("href")
    if not href:
        return None

    if href.startswith("javascript:"):
        # 2. href carries no target; the real URL may be a clip_id
        # embedded in the onclick handler.
        if "clip_id=" in onclick:
            start_idx = onclick.find("clip_id=")
            end_idx = onclick.find("'", start_idx)
            # Was `start_idx > 0`, which wrongly rejected an onclick that
            # *starts* with "clip_id=" — str.find signals absence with -1.
            if start_idx != -1 and end_idx > start_idx:
                clip_id = onclick[start_idx + len("clip_id=") : end_idx]
                return (
                    f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={clip_id}"
                )
        return None

    # 3. Plain href: resolve relative links against the listing page.
    return urljoin(BASE_URL, href)


async def parse_meetings(html: str) -> List[Dict[str, Optional[str]]]:
    """
    Parse the meeting data from the HTML content.

    Kept `async` for interface compatibility — callers await it — even
    though the parsing itself is synchronous.

    Args:
        html: The HTML content of the page

    Returns:
        A list of dictionaries with keys "meeting", "date", "duration",
        "agenda", and "video". "agenda" and "video" hold absolute URLs,
        or None when no link could be extracted.
    """
    parser = HTMLParser(html)
    # Each Granicus listing table holds one group of meetings.
    tables = parser.css("table.listingTable")
    if not tables:
        return []

    meetings: List[Dict[str, Optional[str]]] = []
    for table in tables:
        # The tbody section contains the actual meeting rows.
        tbody = table.css_first("tbody")
        if not tbody:
            continue
        for row in tbody.css("tr"):
            cells = row.css("td")
            # A valid row has at least: name, date, duration, agenda, video.
            if len(cells) < 5:
                continue
            meetings.append(
                {
                    "meeting": cells[0].text().strip(),
                    "date": cells[1].text().strip(),
                    "duration": cells[2].text().strip(),
                    "agenda": _extract_agenda_url(cells[3]),
                    "video": _extract_video_url(cells[4]),
                }
            )
    return meetings
async def get_meetings() -> List[Meeting]:
    """
    Fetch and parse meeting data from the Government Access Television website.

    Returns:
        A list of Meeting objects containing meeting data
    """
    async with aiohttp.ClientSession() as session:
        page_html = await fetch_page(BASE_URL, session)
        rows = await parse_meetings(page_html)
        # Each parsed row dict maps directly onto Meeting's fields.
        return [Meeting(**row) for row in rows]