-import requests
+# direct URL works, but includes a token, so I'm using Selenium
+# https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-show-details?webpage_token=c7c7c3cbc2f0478735fc746ca985b8f4221dea31c24dde99e39fb1c556b07788&auth=YTc5YTAwZmUyMGQ3&id=1421457
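+# (the webpage_token/auth/id query parameters look session-specific, so the
+# scraper drives the public address-search journey instead)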
+
+import time
+from datetime import datetime
+
 from bs4 import BeautifulSoup
+from dateutil.parser import parse
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
 
 from uk_bin_collection.uk_bin_collection.common import *
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
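+# The wildcard import above supplies the helpers used below (create_webdriver,
+# contains_date, remove_ordinal_indicator_from_date_string, date_format)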
 
 
-# import the wonderful Beautiful Soup and the URL grabber
 class CouncilClass(AbstractGetBinDataClass):
-    """
-    Concrete classes have to implement all abstract operations of the
-    base class. They can also override some operations with a default
-    implementation.
-    """
 
     def parse_data(self, page: str, **kwargs) -> dict:
+        driver = None
+        try:
+            data = {"bins": []}
+
+            user_paon = kwargs.get("paon")
+            postcode = kwargs.get("postcode")
+            web_driver = kwargs.get("web_driver")
+            headless = kwargs.get("headless")
+            url = "https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-input-address"
+
+            driver = create_webdriver(web_driver, headless, None, __name__)
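+            # (create_webdriver comes from the common module; the None argument
+            # is presumably the optional user-agent override)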
+            driver.get(url)
+
+            # Wait for the initial page to finish loading
+            WebDriverWait(driver, 10).until(
+                lambda d: d.execute_script("return document.readyState") == "complete"
+            )
+
+            # Create the wait object with a 20-second timeout
+            wait = WebDriverWait(driver, 20)
+
+            # Enter the postcode into the type-ahead search box
+            postcode_input = wait.until(
+                EC.element_to_be_clickable(
+                    (
+                        By.CSS_SELECTOR,
+                        "input.relation_path_type_ahead_search.form-control",
+                    )
+                ),
+                message="Postcode input not found by class",
+            )
+            postcode_input.clear()
+            postcode_input.send_keys(postcode)
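+            # Typing here appears to trigger an asynchronous address lookup; the
+            # matches are rendered into the results holder waited on below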
+
+            # Wait for the address dropdown to populate
+            try:
+                wait.until(
+                    EC.presence_of_element_located(
+                        (By.CSS_SELECTOR, ".relation_path_type_ahead_results_holder")
+                    ),
+                    message="Address results container not found",
+                )
+
+                # Wait for the list items to appear
+                wait.until(
+                    EC.presence_of_all_elements_located(
+                        (By.CSS_SELECTOR, ".relation_path_type_ahead_results_holder li")
+                    ),
+                    message="No address items found in the list",
+                )
+
+                # Search for user_paon in the address list via its aria-label attribute
+                try:
+                    address_xpath = (
+                        f"//li[@aria-label and contains(@aria-label, '{user_paon}')]"
+                    )
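+                    # (assumes user_paon contains no single quote, which would
+                    # break the XPath string literal above)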
+                    matching_address = wait.until(
+                        EC.element_to_be_clickable((By.XPATH, address_xpath)),
+                        message=f"No address containing '{user_paon}' found in aria-label attributes",
+                    )
+                    matching_address.click()
+
+                    # Allow time for the selection to take effect
+                    time.sleep(2)
+
+                    # Find and click the "Select address and continue" button
+                    continue_button = wait.until(
+                        EC.element_to_be_clickable(
+                            (
+                                By.CSS_SELECTOR,
+                                "input.btn.bg-green[value='Select address and continue']",
+                            )
+                        ),
+                        message="Could not find 'Select address and continue' button",
+                    )
+                    continue_button.click()
+
+                    # Allow time for the next page to load
+                    time.sleep(3)
+                except TimeoutException:
+                    raise
+            except TimeoutException:
+                raise
+
+            # The schedule page takes a while to render after the button press
+            time.sleep(5)
+
+            # Wait for the bin type elements, using the only selector known to work
+            try:
+                bin_type_selector = (
+                    By.CSS_SELECTOR,
+                    "div.formatting_bold.formatting_size_bigger.formatting span.value-as-text",
+                )
+                WebDriverWait(driver, 15).until(
+                    EC.presence_of_element_located(bin_type_selector)
+                )
+            except TimeoutException:
+                # Save a screenshot for debugging
+                screenshot_path = f"bin_type_error_{int(time.time())}.png"
+                driver.save_screenshot(screenshot_path)
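+            # Execution deliberately continues after a timeout above, so the
+            # BeautifulSoup pass below still sees whatever did manage to render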
+
+            # Create a BS4 object from the driver's page source
+            soup = BeautifulSoup(driver.page_source, features="html.parser")
+
+            # Bin types sit in this exact HTML structure
+            bin_type_elements = soup.select(
+                "div.formatting_bold.formatting_size_bigger.formatting span.value-as-text"
+            )
+
+            # Visible date text, plus hidden inputs holding DD/MM/YYYY values
+            date_elements = soup.select("div.col-sm-12.font-xs-3xl span.value-as-text")
+            hidden_dates = soup.select(
+                "div.col-sm-12.font-xs-3xl input[type='hidden'][value*='/']"
+            )
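+
+            # The page appears to render one date row per bin type in the same
+            # order, so the loop below pairs types and dates up by index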
+
 
-        user_postcode = kwargs.get("postcode")
-        user_paon = kwargs.get("paon")
-        check_postcode(user_postcode)
-        check_paon(user_paon)
-        bindata = {"bins": []}
 
-        URI = "https://uhtn-wrp.whitespacews.com/"
 
-        session = requests.Session()
+            # Keep only the date spans whose text actually contains a date
+            valid_date_elements = []
 
-        # get link from first page as has some kind of unique hash
-        r = session.get(
-            URI,
-        )
-        r.raise_for_status()
-        soup = BeautifulSoup(r.text, features="html.parser")
+            for element in date_elements:
+                if contains_date(element.get_text(strip=True)):
+                    valid_date_elements.append(element)
 
-        alink = soup.find("a", text="Find my bin collection day")
 
-        if alink is None:
-            raise Exception("Initial page did not load correctly")
+            # Process each bin type and pair it with a collection date
+            for i, bin_type_elem in enumerate(bin_type_elements):
+                bin_type = bin_type_elem.get_text(strip=True)
 
-        # greplace 'seq' query string to skip next step
-        nextpageurl = alink["href"].replace("seq=1", "seq=2")
+                # Try to find a date for this bin type
+                date_text = None
 
-        data = {
-            "address_name_number": user_paon,
-            "address_postcode": user_postcode,
-        }
+                # Prefer the visible date element at the same index
+                if i < len(valid_date_elements):
+                    date_text = valid_date_elements[i].get_text(strip=True)
 
-        # get list of addresses
-        r = session.post(nextpageurl, data)
-        r.raise_for_status()
+                # Fall back to the hidden input if the visible text is unusable
+                if not date_text or not contains_date(date_text):
+                    if i < len(hidden_dates):
+                        date_value = hidden_dates[i].get("value")
+                        if contains_date(date_value):
+                            date_text = date_value
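+                # (this assumes the hidden inputs appear in the same order as
+                # the bin types; the markup does not guarantee it)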
 
-        soup = BeautifulSoup(r.text, features="html.parser")
+                # Skip this bin type if no valid date was found
+                if not date_text or not contains_date(date_text):
+                    continue
 
-        # get first address (if you don't enter enough argument values this won't find the right address)
-        alink = soup.find("div", id="property_list").find("a")
 
-        if alink is None:
-            raise Exception("Address not found")
+                try:
+                    # Strip ordinal suffixes ("1st" -> "1") before parsing
+                    date_text = remove_ordinal_indicator_from_date_string(date_text)
 
-        nextpageurl = URI + alink["href"]
+                    # Try the visible text format first, then the hidden-input format
+                    try:
+                        collection_date = datetime.strptime(
+                            date_text, "%A %d %B %Y"
+                        ).date()
+                    except ValueError:
+                        try:
+                            collection_date = datetime.strptime(
+                                date_text, "%d/%m/%Y"
+                            ).date()
+                        except ValueError:
+                            # Last resort: dateutil, reading ambiguous dates day-first
+                            collection_date = parse(date_text, dayfirst=True).date()
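+                    # (e.g. text like "Monday 14 July 2025" matches the first
+                    # format and a hidden value like "14/07/2025" matches the
+                    # second; the example dates are illustrative)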
 
-        # get collection page
-        r = session.get(
-            nextpageurl,
-        )
-        r.raise_for_status()
-        soup = BeautifulSoup(r.text, features="html.parser")
+                    # Create the bin entry
+                    bin_entry = {
+                        "type": bin_type,
+                        "collectionDate": collection_date.strftime(date_format),
+                    }
 
-        if soup.find("span", id="waste-hint"):
-            raise Exception("No scheduled services at this address")
+                    data["bins"].append(bin_entry)
 
-        u1s = soup.find("section", id="scheduled-collections").find_all("u1")
+                except Exception:
+                    # Skip entries whose dates cannot be parsed
+                    continue
 
-        for u1 in u1s:
-            lis = u1.find_all("li", recursive=False)
 
-            date = lis[1].text.replace("\n", "")
-            bin_type = lis[2].text.replace("\n", "")
+            if not data["bins"]:
+                # Nothing extracted: save the page and a screenshot for debugging
+                with open(f"debug_page_{int(time.time())}.html", "w") as f:
+                    f.write(driver.page_source)
+                driver.save_screenshot(f"final_error_screenshot_{int(time.time())}.png")
+                raise ValueError(
+                    "No bin collection data could be extracted from the page"
+                )
 
-            dict_data = {
-                "type": bin_type,
-                "collectionDate": datetime.strptime(
-                    date,
-                    "%d/%m/%Y",
-                ).strftime(date_format),
-            }
-            bindata["bins"].append(dict_data)
+            # Sort the bin collections by date
+            data["bins"].sort(
+                key=lambda x: datetime.strptime(x.get("collectionDate"), date_format)
+            )
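+            # data now looks like, e.g.:
+            # {"bins": [{"type": "Refuse", "collectionDate": "14/07/2025"}, ...]}
+            # (illustrative values; the date string format comes from common's date_format)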
 
-        bindata["bins"].sort(
-            key=lambda x: datetime.strptime(x.get("collectionDate"), date_format)
-        )
+            return data
 
-        return bindata
+        except Exception:
+            raise
+        finally:
+            # Make sure the browser is closed even when scraping fails
+            if driver:
+                driver.quit()