Skip to content

Commit e64aa52

Browse files
authored
Merge pull request #1603 from joe-pritchard/master
fix: Update parsing for Northumberland council's new website
2 parents 94c255a + f73353b commit e64aa52

File tree

3 files changed

+71
-87
lines changed

3 files changed

+71
-87
lines changed

uk_bin_collection/tests/input.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1776,13 +1776,13 @@
17761776
"LAD24CD": "E06000065"
17771777
},
17781778
"NorthumberlandCouncil": {
1779-
"house_number": "22",
1780-
"postcode": "NE46 1UQ",
1779+
"uprn": "010096302588",
1780+
"postcode": "NE65 0ZP",
17811781
"skip_get_url": true,
1782-
"url": "https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx",
1782+
"url": "https://bincollection.northumberland.gov.uk/postcode",
17831783
"web_driver": "http://selenium:4444",
17841784
"wiki_name": "Northumberland",
1785-
"wiki_note": "Pass the house number and postcode in their respective parameters. This parser requires a Selenium webdriver.",
1785+
"wiki_note": "Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyaddress.co.uk/search).",
17861786
"LAD24CD": "E06000057"
17871787
},
17881788
"NorwichCityCouncil": {
@@ -2802,4 +2802,4 @@
28022802
"wiki_note": "Provide your UPRN.",
28032803
"LAD24CD": "E06000014"
28042804
}
2805-
}
2805+
}

uk_bin_collection/uk_bin_collection/councils/NorthumberlandCouncil.py

Lines changed: 63 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
import time
2+
import datetime
23

4+
from datetime import datetime
35
from bs4 import BeautifulSoup
46
from selenium.common.exceptions import TimeoutException
57
from selenium.webdriver.common.by import By
8+
from selenium.webdriver.common.keys import Keys
69
from selenium.webdriver.support import expected_conditions as EC
7-
from selenium.webdriver.support.ui import WebDriverWait
10+
from selenium.webdriver.support.ui import Select, WebDriverWait
811

912
from uk_bin_collection.uk_bin_collection.common import *
1013
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
1114

12-
# import the wonderful Beautiful Soup and the URL grabber
13-
14-
1515
class CouncilClass(AbstractGetBinDataClass):
1616
"""
1717
Concrete classes have to implement all abstract operations of the
@@ -30,16 +30,18 @@ def extract_styles(self, style_str: str) -> dict:
3030
def parse_data(self, page: str, **kwargs) -> dict:
3131
driver = None
3232
try:
33-
page = "https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx"
33+
page = "https://bincollection.northumberland.gov.uk/postcode"
3434

3535
data = {"bins": []}
3636

37-
user_paon = kwargs.get("paon")
3837
user_postcode = kwargs.get("postcode")
38+
user_uprn = kwargs.get("uprn")
39+
40+
check_postcode(user_postcode)
41+
check_uprn(user_uprn)
42+
3943
web_driver = kwargs.get("web_driver")
4044
headless = kwargs.get("headless")
41-
check_paon(user_paon)
42-
check_postcode(user_postcode)
4345

4446
# Create Selenium webdriver
4547
driver = create_webdriver(web_driver, headless, None, __name__)
@@ -50,105 +52,87 @@ def parse_data(self, page: str, **kwargs) -> dict:
5052

5153
# Wait for and click cookie button
5254
cookie_button = wait.until(
53-
EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
55+
EC.element_to_be_clickable(
56+
(By.CLASS_NAME, "accept-all")
57+
)
5458
)
5559
cookie_button.click()
5660

57-
# Wait for and find house number input
58-
inputElement_hn = wait.until(
61+
# Wait for and find postcode input
62+
inputElement_pc = wait.until(
5963
EC.presence_of_element_located(
60-
(
61-
By.ID,
62-
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtHouse",
63-
)
64+
(By.ID, "postcode")
6465
)
6566
)
6667

67-
# Wait for and find postcode input
68-
inputElement_pc = wait.until(
68+
# Enter postcode and submit
69+
inputElement_pc.send_keys(user_postcode)
70+
inputElement_pc.send_keys(Keys.ENTER)
71+
72+
# Wait for and find the address dropdown (options are keyed by UPRN)
73+
selectElement_address = wait.until(
6974
EC.presence_of_element_located(
70-
(
71-
By.ID,
72-
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtPostcode",
73-
)
75+
(By.ID, "address")
7476
)
7577
)
7678

77-
# Enter details
78-
inputElement_pc.send_keys(user_postcode)
79-
inputElement_hn.send_keys(user_paon)
79+
dropdown = Select(selectElement_address)
80+
dropdown.select_by_value(user_uprn)
8081

81-
# Click lookup button and wait for results
82-
lookup_button = wait.until(
82+
# Click submit button and wait for results
83+
submit_button = wait.until(
8384
EC.element_to_be_clickable(
84-
(
85-
By.ID,
86-
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_butLookup",
87-
)
85+
(By.CLASS_NAME, "govuk-button")
8886
)
8987
)
90-
lookup_button.click()
88+
submit_button.click()
9189

9290
# Wait for results to load
9391
route_summary = wait.until(
9492
EC.presence_of_element_located(
95-
(
96-
By.ID,
97-
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
98-
)
93+
(By.CLASS_NAME, "govuk-table")
9994
)
10095
)
10196

97+
now = datetime.now()
98+
current_month = now.month
99+
current_year = now.year
100+
102101
# Get page source after everything has loaded
103102
soup = BeautifulSoup(driver.page_source, features="html.parser")
104103

105-
# Work out which bins can be collected for this address. Glass bins are only on some houses due to pilot programme.
106-
bins_collected = list(
107-
map(
108-
str.strip,
109-
soup.find(
110-
"span",
111-
id="p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
112-
)
113-
.text.replace("Routes found: ", "")
114-
.split(","),
104+
# From the table, find all rows:
105+
# - cell 1 is the date in format eg. 9 September (so no year value 🥲)
106+
# - cell 2 is the day name, not useful
107+
# - cell 3 is the bin type eg. "General waste", "Recycling", "Garden waste"
108+
rows = soup.find("tbody", class_="govuk-table__body").find_all("tr", class_="govuk-table__row")
109+
110+
for row in rows:
111+
bin_type=row.find_all("td")[-1].text.strip()
112+
113+
collection_date_string = row.find('th').text.strip()
114+
115+
# sometimes but not always the day is written "22nd" instead of 22 so make sure we get a proper int
116+
collection_date_day = "".join([i for i in list(collection_date_string.split(" ")[0]) if i.isdigit()])
117+
collection_date_month_name = collection_date_string.split(" ")[1]
118+
119+
# if we are currently in Oct, Nov, or Dec and the collection month is Jan, Feb, or Mar, let's assume it's next year
120+
if (current_month >= 10) and (collection_date_month_name in ["January", "February", "March"]):
121+
collection_date_year = current_year + 1
122+
else:
123+
collection_date_year = current_year
124+
125+
collection_date = time.strptime(
126+
f"{collection_date_day} {collection_date_month_name} {collection_date_year}", "%d %B %Y"
115127
)
116-
)
117128

118-
# Get the background colour for each of them...
119-
bins_by_colours = dict()
120-
for bin in bins_collected:
121-
if "(but no dates found)" in bin:
122-
continue
123-
style_str = soup.find("span", string=bin)["style"]
124-
bin_colour = self.extract_styles(style_str)["background-color"].upper()
125-
bins_by_colours[bin_colour] = bin
126-
127-
# Work through the tables gathering the dates, if the cell has a background colour - match it to the bin type.
128-
calander_tables = soup.find_all("table", title="Calendar")
129-
for table in calander_tables:
130-
# Get month and year
131-
# First row in table is the header
132-
rows = table.find_all("tr")
133-
month_and_year = (
134-
rows[0].find("table", class_="calCtrlTitle").find("td").string
129+
# Add it to the data
130+
data["bins"].append(
131+
{
132+
"type": bin_type,
133+
"collectionDate": time.strftime(date_format, collection_date),
134+
}
135135
)
136-
bin_days = table.find_all("td", class_="calCtrlDay")
137-
for day in bin_days:
138-
day_styles = self.extract_styles(day["style"])
139-
if "background-color" in day_styles:
140-
colour = day_styles["background-color"].upper()
141-
date = time.strptime(
142-
f"{day.string} {month_and_year}", "%d %B %Y"
143-
)
144-
145-
# Add it to the data
146-
data["bins"].append(
147-
{
148-
"type": bins_by_colours[colour],
149-
"collectionDate": time.strftime(date_format, date),
150-
}
151-
)
152136
except Exception as e:
153137
# Here you can log the exception if needed
154138
print(f"An error occurred: {e}")

wiki/Councils.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2859,15 +2859,15 @@ Note: Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyadd
28592859

28602860
### Northumberland
28612861
```commandline
2862-
python collect_data.py NorthumberlandCouncil https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx -s -p "XXXX XXX" -n XX -w http://HOST:PORT/
2862+
python collect_data.py NorthumberlandCouncil https://bincollection.northumberland.gov.uk/postcode -s -u XXXXXXXX -p "XXXX XXX" -w http://HOST:PORT/
28632863
```
28642864
Additional parameters:
28652865
- `-s` - skip get URL
2866+
- `-u` - UPRN
28662867
- `-p` - postcode
2867-
- `-n` - house number
28682868
- `-w` - remote Selenium web driver URL (required for Home Assistant)
28692869

2870-
Note: Pass the house number and postcode in their respective parameters. This parser requires a Selenium webdriver.
2870+
Note: Pass the UPRN and postcode in their respective parameters. You can find your UPRN using [FindMyAddress](https://www.findmyaddress.co.uk/search). This parser requires a Selenium webdriver.
28712871

28722872
---
28732873

0 commit comments

Comments
 (0)