
Commit 8329b2e

fix: NorthHertfordshire selenium script
1 parent 5ac2711 commit 8329b2e

2 files changed: +260, -64 lines

uk_bin_collection/tests/input.json

Lines changed: 3 additions & 1 deletion

@@ -1693,9 +1693,10 @@
         "LAD24CD": "E06000012"
     },
     "NorthHertfordshireDistrictCouncil": {
-        "house_number": "2",
+        "house_number": "22",
         "postcode": "SG6 4BJ",
         "url": "https://www.north-herts.gov.uk",
+        "web_driver": "http://selenium:4444",
         "wiki_name": "North Hertfordshire",
         "wiki_note": "Pass the house number and postcode in their respective parameters.",
         "LAD24CD": "E07000099"
@@ -2180,6 +2181,7 @@
     },
     "SouthRibbleCouncil": {
         "uprn": "010013246384",
+        "postcode": "PR5 6DT",
         "url": "https://www.southribble.gov.uk",
         "wiki_command_url_override": "https://www.southribble.gov.uk",
         "wiki_name": "South Ribble",
Lines changed: 257 additions & 63 deletions

@@ -1,93 +1,287 @@
-import requests
+# direct URL works, but includes a token, so I'm using Selenium
+# https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-show-details?webpage_token=c7c7c3cbc2f0478735fc746ca985b8f4221dea31c24dde99e39fb1c556b07788&auth=YTc5YTAwZmUyMGQ3&id=1421457
+
+import re
+import time
+from datetime import datetime
+
 from bs4 import BeautifulSoup
+from dateutil.parser import parse
+from selenium.common.exceptions import NoSuchElementException, TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import Select
+from selenium.webdriver.support.wait import WebDriverWait

 from uk_bin_collection.uk_bin_collection.common import *
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass


-# import the wonderful Beautiful Soup and the URL grabber
 class CouncilClass(AbstractGetBinDataClass):
-    """
-    Concrete classes have to implement all abstract operations of the
-    base class. They can also override some operations with a default
-    implementation.
-    """

     def parse_data(self, page: str, **kwargs) -> dict:
+        driver = None
+        try:
+            data = {"bins": []}
+
+            user_paon = kwargs.get("paon")
+            postcode = kwargs.get("postcode")
+            web_driver = kwargs.get("web_driver")
+            headless = kwargs.get("headless")
+            url = "https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-input-address"
+
+            driver = create_webdriver(web_driver, headless, None, __name__)
+            driver.get(url)
+
+            WebDriverWait(driver, 10).until(
+                lambda d: d.execute_script("return document.readyState") == "complete"
+            )
+
+            # Define the wait variable
+            wait = WebDriverWait(
+                driver, 20
+            )  # Create the wait object with a 20-second timeout
+
+            # Enter postcode - try different approaches for reliability
+            # print("Looking for postcode input...")
+
+            postcode_input = wait.until(
+                EC.element_to_be_clickable(
+                    (
+                        By.CSS_SELECTOR,
+                        "input.relation_path_type_ahead_search.form-control",
+                    )
+                ),
+                message="Postcode input not found by class",
+            )
+            postcode_input.clear()
+            postcode_input.send_keys(postcode)
+            # print(f"Entered postcode: {postcode}")
+
+            # Wait for the dropdown to load
+            # print("Waiting for address list to populate...")
+            try:
+                # Wait for the results to appear
+                wait.until(
+                    EC.presence_of_element_located(
+                        (By.CSS_SELECTOR, ".relation_path_type_ahead_results_holder")
+                    ),
+                    message="Address results container not found",
+                )
+
+                # Wait for list items to appear
+                wait.until(
+                    EC.presence_of_all_elements_located(
+                        (By.CSS_SELECTOR, ".relation_path_type_ahead_results_holder li")
+                    ),
+                    message="No address items found in the list",
+                )
+                # print("Address list populated successfully")
+
+                # Search for user_paon in the address list using aria-label attribute
+                try:
+                    # Use XPath to look for aria-label containing user_paon
+                    address_xpath = (
+                        f"//li[@aria-label and contains(@aria-label, '{user_paon}')]"
+                    )
+                    matching_address = wait.until(
+                        EC.element_to_be_clickable((By.XPATH, address_xpath)),
+                        message=f"No address containing '{user_paon}' found in aria-label attributes",
+                    )
+                    # print(f"Found matching address: {matching_address.get_attribute('aria-label')}")
+                    matching_address.click()
+                    # print("Clicked on matching address")
+
+                    # Allow time for the selection to take effect
+                    time.sleep(2)
+
+                    # Find and click the "Select address and continue" button
+                    continue_button = wait.until(
+                        EC.element_to_be_clickable(
+                            (
+                                By.CSS_SELECTOR,
+                                "input.btn.bg-green[value='Select address and continue']",
+                            )
+                        ),
+                        message="Could not find 'Select address and continue' button",
+                    )
+                    # print("Found 'Select address and continue' button, clicking it...")
+                    continue_button.click()
+                    # print("Clicked on 'Select address and continue' button")
+
+                    # Allow time for the page to load after clicking the button
+                    time.sleep(3)
+                except TimeoutException as e:
+                    # print(f"Error finding address: {e}")
+                    raise
+            except TimeoutException as e:
+                # print(f"Error loading address list: {e}")
+                raise
+
+            # After pressing Next button and waiting for page to load
+            # print("Looking for schedule list...")
+
+            # Wait for the page to load - giving it extra time
+            time.sleep(5)
+
+            # Use only the selector that we know works
+            # print("Looking for bin type elements...")
+            try:
+                bin_type_selector = (
+                    By.CSS_SELECTOR,
+                    "div.formatting_bold.formatting_size_bigger.formatting span.value-as-text",
+                )
+                WebDriverWait(driver, 15).until(
+                    EC.presence_of_element_located(bin_type_selector)
+                )
+                # print(f"Found bin type elements with selector: {bin_type_selector}")
+            except TimeoutException:
+                # print("Could not find bin type elements. Taking screenshot for debugging...")
+                screenshot_path = f"bin_type_error_{int(time.time())}.png"
+                driver.save_screenshot(screenshot_path)
+                # print(f"Screenshot saved to {screenshot_path}")
+
+            # Create BS4 object from driver's page source
+            # print("Parsing page with BeautifulSoup...")
+            soup = BeautifulSoup(driver.page_source, features="html.parser")
+
+            # Initialize data dictionary
+            data = {"bins": []}
+
+            # Looking for bin types in the exact HTML structure
+            bin_type_elements = soup.select(
+                "div.formatting_bold.formatting_size_bigger.formatting span.value-as-text"
+            )
+            # print(f"Found {len(bin_type_elements)} bin type elements")
+
+            # Look specifically for date elements with the exact structure
+            date_elements = soup.select("div.col-sm-12.font-xs-3xl span.value-as-text")
+            hidden_dates = soup.select(
+                "div.col-sm-12.font-xs-3xl input[type='hidden'][value*='/']"
+            )
+
+            # print(f"Found {len(bin_type_elements)} bin types and {len(date_elements)} date elements")
+
+            # We need a smarter way to match bin types with their dates
+            bin_count = 0
+
+            # Map of bin types to their collection dates
+            bin_date_map = {}
+
+            # Extract all date strings that look like actual dates
+            date_texts = []
+            date_pattern = re.compile(
+                r"(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+\d+(?:st|nd|rd|th)?\s+\w+\s+\d{4}",
+                re.IGNORECASE,
+            )

-        user_postcode = kwargs.get("postcode")
-        user_paon = kwargs.get("paon")
-        check_postcode(user_postcode)
-        check_paon(user_paon)
-        bindata = {"bins": []}
+            for element in date_elements:
+                text = element.get_text(strip=True)
+                if date_pattern.search(text):
+                    date_texts.append(text)
+                    # print(f"Found valid date text: {text}")

-        URI = "https://uhtn-wrp.whitespacews.com/"
+            # Find hidden date inputs with values in DD/MM/YYYY format
+            hidden_date_values = []
+            for hidden in hidden_dates:
+                value = hidden.get("value", "")
+                if re.match(r"\d{1,2}/\d{1,2}/\d{4}", value):
+                    hidden_date_values.append(value)
+                    # print(f"Found hidden date value: {value}")

-        session = requests.Session()
+            # When filtering date elements
+            date_elements = soup.select("div.col-sm-12.font-xs-3xl span.value-as-text")
+            valid_date_elements = []

-        # get link from first page as has some kind of unique hash
-        r = session.get(
-            URI,
-        )
-        r.raise_for_status()
-        soup = BeautifulSoup(r.text, features="html.parser")
+            for element in date_elements:
+                text = element.get_text(strip=True)
+                if contains_date(text):
+                    valid_date_elements.append(element)
+                    # print(f"Found valid date element: {text}")
+                else:
+                    pass
+                    # print(f"Skipping non-date element: {text}")

-        alink = soup.find("a", text="Find my bin collection day")
+            # print(f"Found {len(bin_type_elements)} bin types and {len(valid_date_elements)} valid date elements")

-        if alink is None:
-            raise Exception("Initial page did not load correctly")
+            # When processing each bin type
+            for i, bin_type_elem in enumerate(bin_type_elements):
+                bin_type = bin_type_elem.get_text(strip=True)

-        # greplace 'seq' query string to skip next step
-        nextpageurl = alink["href"].replace("seq=1", "seq=2")
+                # Try to find a date for this bin type
+                date_text = None

-        data = {
-            "address_name_number": user_paon,
-            "address_postcode": user_postcode,
-        }
+                # Look for a valid date element
+                if i < len(valid_date_elements):
+                    date_elem = valid_date_elements[i]
+                    date_text = date_elem.get_text(strip=True)

-        # get list of addresses
-        r = session.post(nextpageurl, data)
-        r.raise_for_status()
+                # If we don't have a valid date yet, try using the hidden input
+                if not date_text or not contains_date(date_text):
+                    if i < len(hidden_dates):
+                        date_value = hidden_dates[i].get("value")
+                        if contains_date(date_value):
+                            date_text = date_value

-        soup = BeautifulSoup(r.text, features="html.parser")
+                # Skip if we don't have a valid date
+                if not date_text or not contains_date(date_text):
+                    # print(f"No valid date found for bin type: {bin_type}")
+                    continue

-        # get first address (if you don't enter enough argument values this won't find the right address)
-        alink = soup.find("div", id="property_list").find("a")
+                # print(f"Found bin type: {bin_type} with date: {date_text}")

-        if alink is None:
-            raise Exception("Address not found")
+                try:
+                    # Clean up the date text
+                    date_text = remove_ordinal_indicator_from_date_string(date_text)

-        nextpageurl = URI + alink["href"]
+                    # Try to parse the date
+                    try:
+                        collection_date = datetime.strptime(
+                            date_text, "%A %d %B %Y"
+                        ).date()
+                    except ValueError:
+                        try:
+                            collection_date = datetime.strptime(
+                                date_text, "%d/%m/%Y"
+                            ).date()
+                        except ValueError:
+                            # Last resort
+                            collection_date = parse(date_text).date()

-        # get collection page
-        r = session.get(
-            nextpageurl,
-        )
-        r.raise_for_status()
-        soup = BeautifulSoup(r.text, features="html.parser")
+                    # Create bin entry
+                    bin_entry = {
+                        "type": bin_type,
+                        "collectionDate": collection_date.strftime(date_format),
+                    }

-        if soup.find("span", id="waste-hint"):
-            raise Exception("No scheduled services at this address")
+                    # Add to data
+                    data["bins"].append(bin_entry)
+                    bin_count += 1
+                    # print(f"Added bin entry: {bin_entry}")

-        u1s = soup.find("section", id="scheduled-collections").find_all("u1")
+                except Exception as e:
+                    pass
+                    # print(f"Error parsing date '{date_text}': {str(e)}")

-        for u1 in u1s:
-            lis = u1.find_all("li", recursive=False)
+            # print(f"Successfully parsed {bin_count} bin collections")

-            date = lis[1].text.replace("\n", "")
-            bin_type = lis[2].text.replace("\n", "")
+            if not data["bins"]:
+                # print("No bin data found. Saving page for debugging...")
+                with open(f"debug_page_{int(time.time())}.html", "w") as f:
+                    f.write(driver.page_source)
+                driver.save_screenshot(f"final_error_screenshot_{int(time.time())}.png")
+                raise ValueError(
+                    "No bin collection data could be extracted from the page"
+                )

-            dict_data = {
-                "type": bin_type,
-                "collectionDate": datetime.strptime(
-                    date,
-                    "%d/%m/%Y",
-                ).strftime(date_format),
-            }
-            bindata["bins"].append(dict_data)
+            # Sort the bin collections by date
+            data["bins"].sort(
+                key=lambda x: datetime.strptime(x.get("collectionDate"), date_format)
+            )

-        bindata["bins"].sort(
-            key=lambda x: datetime.strptime(x.get("collectionDate"), date_format)
-        )
+            return data

-        return bindata
+        except Exception as e:
+            # print(f"Error parsing bin collection data: {e}")
+            raise
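
The rewritten parser accepts either the visible schedule label or the hidden input value in DD/MM/YYYY form, and only falls back to dateutil when neither fixed format matches. Below is a standalone sketch of that fallback chain with the ordinal stripping done inline; the helper name and the sample dates are illustrative only, not taken from the commit.

import re
from datetime import datetime

from dateutil.parser import parse


def to_collection_date(date_text: str) -> str:
    """Normalise a schedule label or hidden-input value to DD/MM/YYYY."""
    # Strip ordinal suffixes ("6th" -> "6"), mirroring remove_ordinal_indicator_from_date_string
    date_text = re.sub(r"(?<=\d)(st|nd|rd|th)", "", date_text)
    for fmt in ("%A %d %B %Y", "%d/%m/%Y"):  # visible label first, then hidden value
        try:
            return datetime.strptime(date_text, fmt).strftime("%d/%m/%Y")
        except ValueError:
            continue
    return parse(date_text).strftime("%d/%m/%Y")  # last resort: dateutil


print(to_collection_date("Friday 6th June 2025"))  # 06/06/2025
print(to_collection_date("6/6/2025"))              # 06/06/2025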
