|
1 | 1 | import re |
2 | 2 | from datetime import datetime |
| 3 | +from io import BytesIO |
3 | 4 |
|
4 | 5 | import requests |
5 | | -from bs4 import BeautifulSoup |
| 6 | +from pypdf import PdfReader |
6 | 7 | from waste_collection_schedule import Collection # type: ignore[attr-defined] |
7 | 8 |
|
8 | 9 | TITLE = "Redbridge Council" |
|
17 | 18 | "REFUSE": "mdi:trash-can", |
18 | 19 | "RECYCLING": "mdi:recycle", |
19 | 20 | "GARDEN": "mdi:leaf", |
| 21 | + "FOOD": "mdi:food-apple", |
20 | 22 | } |
21 | 23 |
|
# Upper-cased first words of the service names the schedule parser accepts;
# lines whose first word is not listed here are ignored.
KNOWN_SERVICES = {
    "REFUSE",
    "RECYCLING",
    "GARDEN",
    "FOOD",
}
| 25 | + |
| 26 | + |
def _extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The extracted page texts joined without any separator. A page for
        which ``extract_text()`` yields nothing contributes an empty string.
    """
    document = PdfReader(BytesIO(pdf_bytes))
    return "".join(page.extract_text() or "" for page in document.pages)
| 33 | + |
| 34 | + |
# English month names -> month number. An explicit table keeps month-header
# parsing independent of the process locale (``strptime("%B")`` is not).
_MONTH_NUMBERS = {
    name: number
    for number, name in enumerate(
        (
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
        ),
        start=1,
    )
}

# Month header, e.g. "March 2026".
_MONTH_HEADER_RE = re.compile(
    r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})$",
    re.I,
)
# Weekday header row of the calendar grid.
_WEEKDAY_HEADER_RE = re.compile(r"^(Sun\s+Mon\s+Tue\s+Wed\s+Thu\s+Fri\s+Sat)$", re.I)
# Calendar day row: one or more day numbers, e.g. "1 2" or "3 4 5 6 7".
_DAY_ROW_RE = re.compile(r"^(?:\d{1,2})(?:\s+\d{1,2})*$")


def _is_schedule_boundary(line: str) -> bool:
    """Return True if *line* ends the run of service names following a day row."""
    return bool(
        _MONTH_HEADER_RE.match(line)
        or _WEEKDAY_HEADER_RE.match(line)
        or _DAY_ROW_RE.match(line)
        or "your collection schedule" in line.lower()
    )


def _extract_collections_from_text(text: str) -> list[Collection]:
    """Parse the text of the schedule PDF into collection entries.

    The text is read as a sequence of non-empty, stripped lines:

    * a month header ("March 2026") sets the month and year for later rows;
    * a day row ("3 4 5 6 7") opens a group; the lines after it, up to the
      next month header, weekday header, day row or "your collection
      schedule" banner, are scanned for service names;
    * every other line is ignored.

    A line counts as a service when its first space-separated word,
    upper-cased, is in ``KNOWN_SERVICES``. For this layout all services of a
    group are assigned to the largest day number (1-31) on the day row.

    Args:
        text: Text extracted from the schedule PDF.

    Returns:
        One ``Collection`` per recognised service line, in document order.

    Raises:
        ValueError: If a day row yields a date that does not exist in the
            current month (for example 31 under a February header).
    """
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    total = len(lines)

    entries: list[Collection] = []
    current_month: int | None = None
    current_year: int | None = None

    i = 0
    while i < total:
        line = lines[i]
        i += 1

        header = _MONTH_HEADER_RE.match(line)
        if header:
            current_month = _MONTH_NUMBERS.get(header.group(1).casefold())
            current_year = int(header.group(2))
            continue

        # Anything that is not a day row (weekday headers, page banners,
        # stray text) carries no data; day rows seen before the first month
        # header cannot be dated and are skipped as well.
        if (
            current_month is None
            or current_year is None
            or not _DAY_ROW_RE.match(line)
        ):
            continue

        # The row regex guarantees the line holds only 1-2 digit numbers.
        days = [d for d in map(int, re.findall(r"\d+", line)) if 1 <= d <= 31]

        # Consume the lines belonging to this row, keeping the known services.
        services: list[str] = []
        while i < total and not _is_schedule_boundary(lines[i]):
            candidate = lines[i]
            if candidate.partition(" ")[0].upper() in KNOWN_SERVICES:
                services.append(candidate)
            i += 1

        if not days or not services:
            continue

        # For this layout, all services on the row belong to the last day number.
        date = datetime(current_year, current_month, max(days)).date()
        entries.extend(
            Collection(
                date=date,
                t=service,
                icon=ICON_MAP.get(service.partition(" ")[0].upper()),
            )
            for service in services
        )

    return entries
| 131 | + |
22 | 132 |
|
class Source:
    """Waste collection source backed by the Redbridge schedule PDF."""

    def __init__(self, uprn):
        """Store the property identifier used to request the schedule.

        Args:
            uprn: Property identifier (UPRN); converted to ``str`` and sent
                as the ``uprn`` query parameter.
        """
        self._uprn = str(uprn)

    def fetch(self):
        """Download the schedule PDF and return the collections it lists.

        Returns:
            The list of ``Collection`` entries parsed from the PDF text.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an HTTP error status.
        """
        r = requests.get(
            "https://my.redbridge.gov.uk/RecycleRefuse/GetFile",
            params={"uprn": self._uprn},
            # Without a timeout a stalled server would block fetch() forever.
            timeout=30,
        )
        r.raise_for_status()

        pdf_text = _extract_text_from_pdf(r.content)
        return _extract_collections_from_text(pdf_text)
0 commit comments