Skip to content

Commit 86aeb30

Browse files
authored
replace redbridge_gov_uk parser for new pdf format (#5608)
1 parent 7ad3b15 commit 86aeb30

File tree

2 files changed

+116
-61
lines changed

2 files changed

+116
-61
lines changed
Lines changed: 115 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import re
22
from datetime import datetime
3+
from io import BytesIO
34

45
import requests
5-
from bs4 import BeautifulSoup
6+
from pypdf import PdfReader
67
from waste_collection_schedule import Collection # type: ignore[attr-defined]
78

89
TITLE = "Redbridge Council"
@@ -17,74 +18,128 @@
1718
"REFUSE": "mdi:trash-can",
1819
"RECYCLING": "mdi:recycle",
1920
"GARDEN": "mdi:leaf",
21+
"FOOD": "mdi:food-apple",
2022
}
2123

24+
# Upper-cased first words that mark a line of PDF text as a waste service name;
# the parser ignores lines whose first word is not in this set.
KNOWN_SERVICES = {"REFUSE", "RECYCLING", "GARDEN", "FOOD"}
25+
26+
27+
def _extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the text of every page of a PDF, one page after another.

    Pages are joined with a newline so that the last line of one page can
    never fuse with the first line of the next; the caller parses the result
    line by line and would otherwise lose both lines.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The extracted text. Pages for which ``extract_text()`` returns
        nothing contribute an empty string.
    """
    reader = PdfReader(BytesIO(pdf_bytes))
    return "\n".join(page.extract_text() or "" for page in reader.pages)
33+
34+
35+
def _extract_collections_from_text(text: str) -> list[Collection]:
    """Parse the text of a Redbridge calendar PDF into collection entries.

    The parser walks the non-empty lines of *text* and recognises:

    * month headers such as ``March 2026``, which set the month and year
      used for the rows that follow;
    * day rows made only of 1-2 digit numbers, such as ``3 4 5 6 7``;
    * service lines whose first word (upper-cased) is in ``KNOWN_SERVICES``.

    Every service line found between a day row and the next month header,
    weekday header, day row or "your collection schedule" line is attributed
    to the highest day number on that day row. All other lines are ignored.

    Args:
        text: Text extracted from the schedule PDF.

    Returns:
        One ``Collection`` per (day row, service line) pair, in document
        order. Day rows seen before any month header, and day rows whose
        highest day does not exist in the current month, yield nothing.
    """
    import logging  # local import keeps this function self-contained

    lines = [line.strip() for line in text.splitlines() if line.strip()]

    # A fixed table instead of strptime("%B"), which depends on the locale.
    month_names = (
        "january",
        "february",
        "march",
        "april",
        "may",
        "june",
        "july",
        "august",
        "september",
        "october",
        "november",
        "december",
    )
    month_numbers = {name: number for number, name in enumerate(month_names, start=1)}

    month_regex = re.compile(r"^(" + "|".join(month_names) + r")\s+(\d{4})$", re.I)
    weekday_header_regex = re.compile(
        r"^(Sun\s+Mon\s+Tue\s+Wed\s+Thu\s+Fri\s+Sat)$", re.I
    )
    # A day row is one or more day numbers separated by whitespace.
    day_group_regex = re.compile(r"^(?:\d{1,2})(?:\s+\d{1,2})*$")

    def ends_day_row(candidate: str) -> bool:
        """Return True if *candidate* closes the service list of a day row."""
        return bool(
            month_regex.match(candidate)
            or weekday_header_regex.match(candidate)
            or day_group_regex.match(candidate)
            or "your collection schedule" in candidate.lower()
        )

    current_month: int | None = None
    current_year: int | None = None
    entries: list[Collection] = []

    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1

        header = month_regex.match(line)
        if header:
            # re.I may accept a few non-ASCII look-alike letters; such a name
            # maps to no month and the rows below it are then skipped.
            current_month = month_numbers.get(header.group(1).casefold())
            current_year = int(header.group(2))
            continue

        # Weekday headers, page titles and any other text are not day rows,
        # and day rows are meaningless until a month header has been seen.
        if not (current_month and current_year and day_group_regex.match(line)):
            continue

        # The regex guarantees every token is a 1-2 digit number.
        days = [day for day in map(int, line.split()) if 1 <= day <= 31]

        # Collect the service lines that follow, up to the next boundary.
        services: list[str] = []
        while i < len(lines) and not ends_day_row(lines[i]):
            candidate = lines[i]
            if candidate.split(" ", 1)[0].upper() in KNOWN_SERVICES:
                services.append(candidate)
            i += 1

        if not (days and services):
            continue

        # For this layout, all services belong to the highest day on the row.
        target_day = max(days)
        try:
            date = datetime(current_year, current_month, target_day).date()
        except ValueError:
            # e.g. a stray "31" under a 30-day month: skip this row rather
            # than abort the whole schedule.
            logging.getLogger(__name__).warning(
                "Skipping impossible date %04d-%02d-%02d in Redbridge calendar",
                current_year,
                current_month,
                target_day,
            )
            continue

        for waste_type in services:
            key = waste_type.split(" ", 1)[0].upper()
            entries.append(
                Collection(date=date, t=waste_type, icon=ICON_MAP.get(key))
            )

    return entries
131+
22132

23133
class Source:
    """Fetch the Redbridge collection calendar PDF for a single property."""

    def __init__(self, uprn):
        """Store the property's UPRN.

        Args:
            uprn: Unique Property Reference Number; converted to ``str``.
        """
        self._uprn = str(uprn)

    def fetch(self):
        """Download the calendar PDF and return the collections parsed from it.

        Returns:
            The list produced by ``_extract_collections_from_text``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not respond in time.
        """
        r = requests.get(
            "https://my.redbridge.gov.uk/RecycleRefuse/GetFile",
            params={"uprn": self._uprn},
            # Without a timeout, requests waits indefinitely on a stalled server.
            timeout=30,
        )
        r.raise_for_status()

        pdf_text = _extract_text_from_pdf(r.content)
        return _extract_collections_from_text(pdf_text)

doc/source/redbridge_gov_uk.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ waste_collection_schedule:
3131
#### How to find your `UPRN`
3232
Your UPRN is the number at the end of the URL used to download the collection calendar for your address from the Redbridge website.
3333

34-
For example: _https://my.redbridge.gov.uk/RecycleRefuse/GetFile?uprn=`10034922090`_
34+
For example: _https://my.redbridge.gov.uk/RecycleRefuse/GetFile?uprn=10034922090_
3535

3636
Alternatively, you can discover your Unique Property Reference Number (UPRN) by going to https://www.findmyaddress.co.uk/ and entering your address details.
3737

0 commit comments

Comments
 (0)