Commit d14fdc0

fix: Cotswald and coventry

1 parent c6dbb36
File tree: 4 files changed (+199 -71 lines)

uk_bin_collection/tests/input.json

Lines changed: 1 addition & 1 deletion
@@ -580,7 +580,7 @@
         "LAD24CD": "E06000052"
     },
     "CotswoldDistrictCouncil": {
-        "house_number": "19",
+        "house_number": "19 SUMMERS WAY, MORETON-IN-MARSH, GL56 0GB",
         "postcode": "GL56 0GB",
         "skip_get_url": true,
         "url": "https://community.cotswold.gov.uk/s/waste-collection-enquiry",
uk_bin_collection/uk_bin_collection/councils/CotswoldDistrictCouncil.py

Lines changed: 191 additions & 67 deletions
@@ -1,5 +1,6 @@
 import time
-from datetime import datetime
+import re
+from datetime import datetime, timedelta
 
 from bs4 import BeautifulSoup
 from selenium.webdriver.common.by import By
@@ -11,8 +12,6 @@
 from uk_bin_collection.uk_bin_collection.common import *
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
 
-# import the wonderful Beautiful Soup and the URL grabber
-
 
 class CouncilClass(AbstractGetBinDataClass):
     """
@@ -30,89 +29,214 @@ def parse_data(self, page: str, **kwargs) -> dict:
 
             house_number = kwargs.get("paon")
             postcode = kwargs.get("postcode")
-            full_address = f"{house_number}, {postcode}"
+            # Use house_number as full address since it contains the complete address
+            full_address = house_number if house_number else f"{house_number}, {postcode}"
             web_driver = kwargs.get("web_driver")
             headless = kwargs.get("headless")
 
             # Create Selenium webdriver
             driver = create_webdriver(web_driver, headless, None, __name__)
             driver.get(page)
 
-            # If you bang in the house number (or property name) and postcode in the box it should find your property
+            # Wait for page to load completely
             wait = WebDriverWait(driver, 60)
-            address_entry_field = wait.until(
-                EC.element_to_be_clickable((By.XPATH, '//*[@id="combobox-input-22"]'))
-            )
-
-            address_entry_field.send_keys(str(full_address))
-
-            address_entry_field = wait.until(
-                EC.element_to_be_clickable((By.XPATH, '//*[@id="combobox-input-22"]'))
-            )
-            address_entry_field.click()
-            address_entry_field.send_keys(Keys.BACKSPACE)
-            address_entry_field.send_keys(str(full_address[len(full_address) - 1]))
-
-            first_found_address = wait.until(
-                EC.element_to_be_clickable(
-                    (By.XPATH, '//*[@id="dropdown-element-22"]/ul')
-                )
-            )
-
-            first_found_address.click()
-            # Wait for the 'Select your property' dropdown to appear and select the first result
-            next_btn = wait.until(
-                EC.element_to_be_clickable((By.XPATH, "//lightning-button/button"))
-            )
-            next_btn.click()
-            bin_data = wait.until(
-                EC.presence_of_element_located(
-                    (By.XPATH, "//span[contains(text(), 'Container')]")
-                )
-            )
-
+
+            # Wait for the Salesforce Lightning page to be fully loaded
+            print("Waiting for Salesforce Lightning components to load...")
+            time.sleep(10)
+
+            # Wait for the address input field to be present
+            try:
+                wait.until(EC.presence_of_element_located((By.XPATH, "//label[contains(text(), 'Enter your address')]")))
+                print("Address label found")
+                time.sleep(5)  # Additional wait for the input field to be ready
+            except Exception as e:
+                print(f"Address label not found: {e}")
+
+            # Find the address input field using the label
+            try:
+                address_entry_field = driver.find_element(By.XPATH, "//label[contains(text(), 'Enter your address')]/following-sibling::*//input")
+                print("Found address input field using label xpath")
+            except Exception as e:
+                print(f"Could not find address input field: {e}")
+                raise Exception("Could not find address input field")
+
+            # Clear any existing text and enter the address
+            try:
+                address_entry_field.clear()
+                address_entry_field.send_keys(str(full_address))
+                print(f"Entered address: {full_address}")
+            except Exception as e:
+                print(f"Error entering address: {e}")
+                raise
+
+            # Click the input field again to trigger the dropdown
+            try:
+                address_entry_field.click()
+                print("Clicked input field to trigger dropdown")
+                time.sleep(3)  # Wait for dropdown to appear
+            except Exception as e:
+                print(f"Error clicking input field: {e}")
+
+            # Wait for and click the dropdown option
+            try:
+                dropdown_wait = WebDriverWait(driver, 10)
+                dropdown_option = dropdown_wait.until(EC.element_to_be_clickable((By.XPATH, "//li[@role='presentation']")))
+                dropdown_option.click()
+                print("Clicked dropdown option")
+                time.sleep(2)
+            except Exception as e:
+                print(f"Error clicking dropdown option: {e}")
+                raise
+
+            # Find and click the Next button
+            try:
+                next_wait = WebDriverWait(driver, 10)
+                next_button = next_wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Next')]")))
+                next_button.click()
+                print("Clicked Next button")
+                time.sleep(5)  # Wait for the bin collection data to load
+            except Exception as e:
+                print(f"Error clicking Next button: {e}")
+                raise
+
+            # Wait for the bin collection data table to load
+            try:
+                table_wait = WebDriverWait(driver, 15)
+                table_wait.until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'Collection Day')]")))
+                print("Bin collection data table loaded")
+                time.sleep(3)
+            except Exception as e:
+                print(f"Bin collection table not found: {e}")
+
             soup = BeautifulSoup(driver.page_source, features="html.parser")
-
-            rows = soup.find_all("tr", class_="slds-hint-parent")
             current_year = datetime.now().year
 
+            # Try multiple approaches to find bin collection data
+            rows = []
+
+            # Try different table row selectors
+            table_selectors = [
+                "tr.slds-hint-parent",
+                "tr[class*='slds']",
+                "table tr",
+                ".slds-table tr",
+                "tbody tr"
+            ]
+
+            for selector in table_selectors:
+                rows = soup.select(selector)
+                if rows:
+                    break
+
+            # If no table rows found, try to find any elements containing collection info
+            if not rows:
+                # Look for any elements that might contain bin collection information
+                collection_elements = soup.find_all(text=re.compile(r'(bin|collection|waste|recycling)', re.I))
+                if collection_elements:
+                    # Try to extract information from the surrounding elements
+                    for element in collection_elements[:10]:  # Limit to first 10 matches
+                        parent = element.parent
+                        if parent:
+                            text = parent.get_text().strip()
+                            if text and len(text) > 10:  # Only consider substantial text
+                                # Try to extract date patterns
+                                date_patterns = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{1,2}\s+\w+\s+\d{4}\b', text)
+                                if date_patterns:
+                                    data["bins"].append({
+                                        "type": "General Collection",
+                                        "collectionDate": date_patterns[0]
+                                    })
+                                    break
+
+            # Process table rows if found
             for row in rows:
-                columns = row.find_all("td")
-                if columns:
-                    container_type = row.find("th").text.strip()
-                    if columns[0].get_text() == "Today":
-                        collection_day = datetime.now().strftime("%a, %d %B")
-                    elif columns[0].get_text() == "Tomorrow":
-                        collection_day = (datetime.now() + timedelta(days=1)).strftime(
-                            "%a, %d %B"
-                        )
-                    else:
-                        collection_day = re.sub(
-                            r"[^a-zA-Z0-9,\s]", "", columns[0].get_text()
-                        ).strip()
+                try:
+                    columns = row.find_all(["td", "th"])
+                    if len(columns) >= 2:
+                        # Try to identify container type and date
+                        container_type = "Unknown"
+                        collection_date = ""
+
+                        # Look for header cell (th) for container type
+                        th_element = row.find("th")
+                        if th_element:
+                            container_type = th_element.get_text().strip()
+                        elif columns:
+                            # If no th, use first column as type
+                            container_type = columns[0].get_text().strip()
+
+                        # Look for date in subsequent columns
+                        for col in columns[1:] if th_element else columns[1:]:
+                            col_text = col.get_text().strip()
+                            if col_text:
+                                if col_text.lower() == "today":
+                                    collection_date = datetime.now().strftime("%d/%m/%Y")
+                                    break
+                                elif col_text.lower() == "tomorrow":
+                                    collection_date = (datetime.now() + timedelta(days=1)).strftime("%d/%m/%Y")
+                                    break
+                                else:
+                                    # Try to parse various date formats
+                                    try:
+                                        # Clean the text
+                                        clean_text = re.sub(r"[^a-zA-Z0-9,\s/-]", "", col_text).strip()
+
+                                        # Try different date parsing approaches
+                                        date_formats = [
+                                            "%a, %d %B",
+                                            "%d %B %Y",
+                                            "%d/%m/%Y",
+                                            "%d-%m-%Y",
+                                            "%B %d, %Y"
+                                        ]
+
+                                        for fmt in date_formats:
+                                            try:
+                                                parsed_date = datetime.strptime(clean_text, fmt)
+                                                if fmt == "%a, %d %B":  # Add year if missing
+                                                    if parsed_date.replace(year=current_year) < datetime.now():
+                                                        parsed_date = parsed_date.replace(year=current_year + 1)
+                                                    else:
+                                                        parsed_date = parsed_date.replace(year=current_year)
+                                                collection_date = parsed_date.strftime("%d/%m/%Y")
+                                                break
+                                            except ValueError:
+                                                continue
+
+                                        if collection_date:
+                                            break
+                                    except Exception:
+                                        continue
+
+                        # Add to data if we have both type and date
+                        if container_type and collection_date and container_type.lower() != "unknown":
+                            data["bins"].append({
+                                "type": container_type,
+                                "collectionDate": collection_date
+                            })
+                except Exception as e:
+                    print(f"Error processing row: {e}")
+                    continue
+
+            # If no data found, add a debug entry
+            if not data["bins"]:
+                print("No bin collection data found. Page source:")
+                print(driver.page_source[:1000])  # Print first 1000 chars for debugging
 
-                    # Parse the date from the string
-                    parsed_date = datetime.strptime(collection_day, "%a, %d %B")
-                    if parsed_date < datetime(
-                        parsed_date.year, parsed_date.month, parsed_date.day
-                    ):
-                        parsed_date = parsed_date.replace(year=current_year + 1)
-                    else:
-                        parsed_date = parsed_date.replace(year=current_year)
-                    # Format the date as %d/%m/%Y
-                    formatted_date = parsed_date.strftime("%d/%m/%Y")
-
-                    # Add the bin type and collection date to the 'data' dictionary
-                    data["bins"].append(
-                        {"type": container_type, "collectionDate": formatted_date}
-                    )
         except Exception as e:
             # Here you can log the exception if needed
             print(f"An error occurred: {e}")
+            print(f"Full address used: {full_address}")
+            print(f"Page URL: {page}")
+            # Add some debug information
+            if driver:
+                print(f"Current page title: {driver.title}")
+                print(f"Current URL: {driver.current_url}")
            # Optionally, re-raise the exception if you want it to propagate
             raise
         finally:
             # This block ensures that the driver is closed regardless of an exception
             if driver:
                 driver.quit()
-        return data
+        return data
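
The heart of the new row parser is the date normalisation: strip noise characters, try a list of formats, and infer the year for year-less strings such as "Fri, 14 March". A standalone sketch of that logic under the same assumptions as the diff (the function name normalise_collection_date is illustrative, not part of the commit):

import re
from datetime import datetime

def normalise_collection_date(col_text, now=None):
    """Return col_text as %d/%m/%Y, or None if no known format matches."""
    now = now or datetime.now()
    clean_text = re.sub(r"[^a-zA-Z0-9,\s/-]", "", col_text).strip()
    for fmt in ("%a, %d %B", "%d %B %Y", "%d/%m/%Y", "%d-%m-%Y", "%B %d, %Y"):
        try:
            parsed = datetime.strptime(clean_text, fmt)
        except ValueError:
            continue
        if fmt == "%a, %d %B":
            # Year-less string: assume the current year, rolling over to
            # next year if that date has already passed.
            parsed = parsed.replace(year=now.year)
            if parsed < now:
                parsed = parsed.replace(year=now.year + 1)
        return parsed.strftime("%d/%m/%Y")
    return None

# Example: a date already past in the reference year rolls into the next one.
print(normalise_collection_date("Wed, 1 January", now=datetime(2025, 6, 1)))  # 01/01/2026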

uk_bin_collection/uk_bin_collection/councils/CoventryCityCouncil.py

Lines changed: 6 additions & 2 deletions
@@ -18,17 +18,21 @@ def parse_data(self, page: str, **kwargs) -> dict:
 
         bindata = {"bins": []}
         curr_date = datetime.today()
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
 
         soup = BeautifulSoup(page.content, features="html.parser")
         button = soup.find(
             "a",
             text="Find out which bin will be collected when and sign up for a free email reminder.",
         )
 
-        if button["href"]:
+        if button and button.get("href"):
             URI = button["href"]
             # Make the GET request
-            response = requests.get(URI)
+            response = requests.get(URI, headers=headers)
             soup = BeautifulSoup(response.content, features="html.parser")
             divs = soup.find_all("div", {"class": "editor"})
             for div in divs:
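
Two small hardening changes here: soup.find() returns None when the anchor text is absent, so the old button["href"] would raise a TypeError on any page change, and the follow-up GET now sends a browser-like User-Agent, which some council sites require before serving the page. A minimal sketch of the guarded pattern (the HTML snippet and example.org URL are illustrative):

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

html = '<a href="https://example.org/bins">Find out which bin will be collected when and sign up for a free email reminder.</a>'
soup = BeautifulSoup(html, features="html.parser")
button = soup.find("a")  # may be None on a redesigned page

# None-safe: a missing anchor or missing href becomes a no-op, not a TypeError
if button and button.get("href"):
    response = requests.get(button["href"], headers=headers)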

uk_bin_collection/uk_bin_collection/get_bin_data.py

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ def get_data(cls, url) -> str:
         urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)
 
         try:
-            full_page = requests.get(url, headers, verify=False, timeout=120)
+            full_page = requests.get(url, headers=headers, verify=False, timeout=120)
             return full_page
         except requests.exceptions.RequestException as err:
             _LOGGER.error(f"Request Error: {err}")
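
This one-keyword fix matters because of the signature requests.get(url, params=None, **kwargs): passing the dict positionally bound it to params, so the intended headers were serialised into the query string and the request went out with none of them set. A small sketch of the difference using prepared requests (example.org is illustrative):

import requests

headers = {"User-Agent": "Mozilla/5.0"}

# Positional: the dict binds to `params` and leaks into the URL.
wrong = requests.Request("GET", "https://example.org", params=headers).prepare()
print(wrong.url)  # https://example.org/?User-Agent=Mozilla%2F5.0

# Keyword: the dict is sent as real HTTP headers.
right = requests.Request("GET", "https://example.org", headers=headers).prepare()
print(right.headers["User-Agent"])  # Mozilla/5.0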
