Merge pull request #1603 from joe-pritchard/master

robbrad · web-flow · commit e64aa525d9b0 · 2025-09-21T06:18:15.000+01:00
fix: Update parsing for Northumberland council's new website
diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json
@@ -1776,13 +1776,13 @@
         "LAD24CD": "E06000065"
     },
     "NorthumberlandCouncil": {
-        "house_number": "22",
-        "postcode": "NE46 1UQ",
+        "uprn": "010096302588",
+        "postcode": "NE65 0ZP",
         "skip_get_url": true,
-        "url": "https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx",
+        "url": "https://bincollection.northumberland.gov.uk/postcode",
         "web_driver": "http://selenium:4444",
         "wiki_name": "Northumberland",
-        "wiki_note": "Pass the house number and postcode in their respective parameters. This parser requires a Selenium webdriver.",
+        "wiki_note": "Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyaddress.co.uk/search).",
         "LAD24CD": "E06000057"
     },
     "NorwichCityCouncil": {
@@ -2802,4 +2802,4 @@
         "wiki_note": "Provide your UPRN.",
         "LAD24CD": "E06000014"
     }
-}
+}
diff --git a/uk_bin_collection/uk_bin_collection/councils/NorthumberlandCouncil.py b/uk_bin_collection/uk_bin_collection/councils/NorthumberlandCouncil.py
@@ -1,17 +1,17 @@
 import time
+import datetime
 
+from datetime import datetime
 from bs4 import BeautifulSoup
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support.ui import Select, WebDriverWait
 
 from uk_bin_collection.uk_bin_collection.common import *
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
 
-# import the wonderful Beautiful Soup and the URL grabber
-
-
 class CouncilClass(AbstractGetBinDataClass):
     """
     Concrete classes have to implement all abstract operations of the
@@ -30,16 +30,18 @@ def extract_styles(self, style_str: str) -> dict:
     def parse_data(self, page: str, **kwargs) -> dict:
         driver = None
         try:
-            page = "https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx"
+            page = "https://bincollection.northumberland.gov.uk/postcode"
 
             data = {"bins": []}
 
-            user_paon = kwargs.get("paon")
             user_postcode = kwargs.get("postcode")
+            user_uprn = kwargs.get("uprn")
+
+            check_postcode(user_postcode)
+            check_uprn(user_uprn)
+
             web_driver = kwargs.get("web_driver")
             headless = kwargs.get("headless")
-            check_paon(user_paon)
-            check_postcode(user_postcode)
 
             # Create Selenium webdriver
             driver = create_webdriver(web_driver, headless, None, __name__)
@@ -50,105 +52,87 @@ def parse_data(self, page: str, **kwargs) -> dict:
 
             # Wait for and click cookie button
             cookie_button = wait.until(
-                EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
+                EC.element_to_be_clickable(
+                    (By.CLASS_NAME, "accept-all")
+                )
             )
             cookie_button.click()
 
-            # Wait for and find house number input
-            inputElement_hn = wait.until(
+            # Wait for and find postcode input
+            inputElement_pc = wait.until(
                 EC.presence_of_element_located(
-                    (
-                        By.ID,
-                        "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtHouse",
-                    )
+                    (By.ID, "postcode")
                 )
             )
 
-            # Wait for and find postcode input
-            inputElement_pc = wait.until(
+            # Enter postcode and submit
+            inputElement_pc.send_keys(user_postcode)
+            inputElement_pc.send_keys(Keys.ENTER)
+
+            # Wait for and find the address dropdown
+            selectElement_address = wait.until(
                 EC.presence_of_element_located(
-                    (
-                        By.ID,
-                        "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtPostcode",
-                    )
+                    (By.ID, "address")
                 )
             )
 
-            # Enter details
-            inputElement_pc.send_keys(user_postcode)
-            inputElement_hn.send_keys(user_paon)
+            dropdown = Select(selectElement_address)
+            dropdown.select_by_value(user_uprn)
 
-            # Click lookup button and wait for results
-            lookup_button = wait.until(
+            # Click submit button and wait for results
+            submit_button = wait.until(
                 EC.element_to_be_clickable(
-                    (
-                        By.ID,
-                        "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_butLookup",
-                    )
+                    (By.CLASS_NAME, "govuk-button")
                 )
             )
-            lookup_button.click()
+            submit_button.click()
 
             # Wait for results to load
             route_summary = wait.until(
                 EC.presence_of_element_located(
-                    (
-                        By.ID,
-                        "p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
-                    )
+                    (By.CLASS_NAME, "govuk-table")
                 )
             )
 
+            now = datetime.now()
+            current_month = now.month
+            current_year = now.year
+
             # Get page source after everything has loaded
             soup = BeautifulSoup(driver.page_source, features="html.parser")
 
-            # Work out which bins can be collected for this address. Glass bins are only on some houses due to pilot programme.
-            bins_collected = list(
-                map(
-                    str.strip,
-                    soup.find(
-                        "span",
-                        id="p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
-                    )
-                    .text.replace("Routes found: ", "")
-                    .split(","),
+            # From the table, find all rows:
+            # - cell 1 is the date in format eg. 9 September (so no year value 🥲)
+            # - cell 2 is the day name, not useful
+            # - cell 3 is the bin type eg. "General waste", "Recycling", "Garden waste"
+            rows = soup.find("tbody", class_="govuk-table__body").find_all("tr", class_="govuk-table__row")
+
+            for row in rows:
+                bin_type=row.find_all("td")[-1].text.strip()
+
+                collection_date_string = row.find('th').text.strip()
+
+                # sometimes but not always the day is written "22nd" instead of "22", so keep only the digits
+                collection_date_day = "".join([i for i in list(collection_date_string.split(" ")[0]) if i.isdigit()])
+                collection_date_month_name = collection_date_string.split(" ")[1]
+
+                # if we are currently in Oct, Nov, or Dec and the collection month is Jan, Feb, or Mar, let's assume it's next year
+                if (current_month >= 10) and (collection_date_month_name in ["January", "February", "March"]):
+                    collection_date_year = current_year + 1
+                else:
+                    collection_date_year = current_year
+
+                collection_date = time.strptime(
+                    f"{collection_date_day} {collection_date_month_name} {collection_date_year}", "%d %B %Y"
                 )
-            )
 
-            # Get the background colour for each of them...
-            bins_by_colours = dict()
-            for bin in bins_collected:
-                if "(but no dates found)" in bin:
-                    continue
-                style_str = soup.find("span", string=bin)["style"]
-                bin_colour = self.extract_styles(style_str)["background-color"].upper()
-                bins_by_colours[bin_colour] = bin
-
-            # Work through the tables gathering the dates, if the cell has a background colour - match it to the bin type.
-            calander_tables = soup.find_all("table", title="Calendar")
-            for table in calander_tables:
-                # Get month and year
-                # First row in table is the header
-                rows = table.find_all("tr")
-                month_and_year = (
-                    rows[0].find("table", class_="calCtrlTitle").find("td").string
+                # Add it to the data
+                data["bins"].append(
+                    {
+                        "type": bin_type,
+                        "collectionDate": time.strftime(date_format, collection_date),
+                    }
                 )
-                bin_days = table.find_all("td", class_="calCtrlDay")
-                for day in bin_days:
-                    day_styles = self.extract_styles(day["style"])
-                    if "background-color" in day_styles:
-                        colour = day_styles["background-color"].upper()
-                        date = time.strptime(
-                            f"{day.string} {month_and_year}", "%d %B %Y"
-                        )
-
-                        # Add it to the data
-                        data["bins"].append(
-                            {
-                                "type": bins_by_colours[colour],
-                                "collectionDate": time.strftime(date_format, date),
-                            }
-                        )
         except Exception as e:
             # Here you can log the exception if needed
             print(f"An error occurred: {e}")
diff --git a/wiki/Councils.md b/wiki/Councils.md
@@ -2859,15 +2859,15 @@ Note: Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyadd
 
 ### Northumberland
 ```commandline
-python collect_data.py NorthumberlandCouncil https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx -s -p "XXXX XXX" -n XX -w http://HOST:PORT/
+python collect_data.py NorthumberlandCouncil https://bincollection.northumberland.gov.uk/postcode -s -u XXXXXXXX -p "XXXX XXX" -w http://HOST:PORT/
 ```
 Additional parameters:
 - `-s` - skip get URL
+- `-u` - UPRN
 - `-p` - postcode
-- `-n` - house number
 - `-w` - remote Selenium web driver URL (required for Home Assistant)
 
-Note: Pass the house number and postcode in their respective parameters. This parser requires a Selenium webdriver.
+Note: Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyaddress.co.uk/search).
 
 ---