Skip to content

Commit 2246571

Browse files
committed
fix: #1584 - NorthHertfordshireDistrictCouncil
fix: #1584 - NorthHertfordshireDistrictCouncil
1 parent e86d54c commit 2246571

File tree

1 file changed

+26
-128
lines changed

1 file changed

+26
-128
lines changed

uk_bin_collection/uk_bin_collection/councils/NorthHertfordshireDistrictCouncil.py

Lines changed: 26 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -125,146 +125,44 @@ def parse_data(self, page: str, **kwargs) -> dict:
125125
# Wait for the page to load - giving it extra time
126126
time.sleep(5)
127127

128-
# Use only the selector that we know works
129-
# print("Looking for bin type elements...")
130-
try:
131-
bin_type_selector = (
132-
By.CSS_SELECTOR,
133-
"div.formatting_bold.formatting_size_bigger.formatting span.value-as-text",
134-
)
135-
WebDriverWait(driver, 15).until(
136-
EC.presence_of_element_located(bin_type_selector)
137-
)
138-
# print(f"Found bin type elements with selector: {bin_type_selector}")
139-
except TimeoutException:
140-
# print("Could not find bin type elements. Taking screenshot for debugging...")
141-
screenshot_path = f"bin_type_error_{int(time.time())}.png"
142-
driver.save_screenshot(screenshot_path)
143-
# print(f"Screenshot saved to {screenshot_path}")
144-
145128
# Create BS4 object from driver's page source
146129
# print("Parsing page with BeautifulSoup...")
147130
soup = BeautifulSoup(driver.page_source, features="html.parser")
148131

149132
# Initialize data dictionary
150133
data = {"bins": []}
151134

152-
# Looking for bin types in the exact HTML structure
153-
bin_type_elements = soup.select(
154-
"div.page_cell.contains_widget:first-of-type div.formatting_bold.formatting_size_bigger.formatting span.value-as-text"
155-
)
156-
# print(f"Found {len(bin_type_elements)} bin type elements")
157-
158-
# Look specifically for date elements with the exact structure
159-
date_elements = soup.select("div.col-sm-12.font-xs-3xl span.value-as-text")
160-
hidden_dates = soup.select(
161-
"div.col-sm-12.font-xs-3xl input[type='hidden'][value*='/']"
162-
)
163-
164-
# print(f"Found {len(bin_type_elements)} bin types and {len(date_elements)} date elements")
165-
166-
# We need a smarter way to match bin types with their dates
167-
bin_count = 0
135+
for row in soup.select(".listing_template_row"):
136+
# Title (waste stream) is the first <p> in the section
137+
first_p = row.find("p")
138+
if not first_p:
139+
continue
140+
stream = first_p.get_text(" ", strip=True)
168141

169-
# Map of bin types to their collection dates
170-
bin_date_map = {}
142+
for p in row.find_all("p"):
143+
t = p.get_text("\n", strip=True)
171144

172-
# Extract all date strings that look like actual dates
173-
date_texts = []
174-
date_pattern = re.compile(
175-
r"(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+\d+(?:st|nd|rd|th)?\s+\w+\s+\d{4}",
176-
re.IGNORECASE,
177-
)
145+
if re.search(r"\bNext collection\b", t, flags=re.I):
146+
# Expect format: "Next collection\nTuesday 16th September 2025"
147+
parts = [x.strip() for x in t.split("\n") if x.strip()]
148+
if len(parts) >= 2:
149+
next_collection_display = parts[-1] # last line
178150

179-
for element in date_elements:
180-
text = element.get_text(strip=True)
181-
if date_pattern.search(text):
182-
date_texts.append(text)
183-
# print(f"Found valid date text: {text}")
184-
185-
# Find hidden date inputs with values in DD/MM/YYYY format
186-
hidden_date_values = []
187-
for hidden in hidden_dates:
188-
value = hidden.get("value", "")
189-
if re.match(r"\d{1,2}/\d{1,2}/\d{4}", value):
190-
hidden_date_values.append(value)
191-
# print(f"Found hidden date value: {value}")
192-
193-
# When filtering date elements
194-
date_elements = soup.select("div.col-sm-12.font-xs-3xl span.value-as-text")
195-
valid_date_elements = []
196-
197-
for element in date_elements:
198-
text = element.get_text(strip=True)
199-
if contains_date(text):
200-
valid_date_elements.append(element)
201-
# print(f"Found valid date element: {text}")
202-
else:
203-
pass
204-
# print(f"Skipping non-date element: {text}")
205-
206-
# print(f"Found {len(bin_type_elements)} bin types and {len(valid_date_elements)} valid date elements")
207-
208-
# When processing each bin type
209-
for i, bin_type_elem in enumerate(bin_type_elements):
210-
bin_type = bin_type_elem.get_text(strip=True)
211-
212-
# Try to find a date for this bin type
213-
date_text = None
214-
215-
# Look for a valid date element
216-
if i < len(valid_date_elements):
217-
date_elem = valid_date_elements[i]
218-
date_text = date_elem.get_text(strip=True)
219-
220-
# If we don't have a valid date yet, try using the hidden input
221-
if not date_text or not contains_date(date_text):
222-
if i < len(hidden_dates):
223-
date_value = hidden_dates[i].get("value")
224-
if contains_date(date_value):
225-
date_text = date_value
226-
227-
# Skip if we don't have a valid date
228-
if not date_text or not contains_date(date_text):
229-
# print(f"No valid date found for bin type: {bin_type}")
230-
continue
151+
# Build record
152+
next_date = datetime.strptime(
153+
remove_ordinal_indicator_from_date_string(next_collection_display),
154+
"%A %d %B %Y",
155+
)
231156

232-
# print(f"Found bin type: {bin_type} with date: {date_text}")
157+
# Create bin entry
158+
bin_entry = {
159+
"type": stream,
160+
"collectionDate": next_date.strftime(date_format),
161+
}
233162

234-
try:
235-
# Clean up the date text
236-
date_text = remove_ordinal_indicator_from_date_string(date_text)
237-
238-
# Try to parse the date
239-
try:
240-
collection_date = datetime.strptime(
241-
date_text, "%A %d %B %Y"
242-
).date()
243-
except ValueError:
244-
try:
245-
collection_date = datetime.strptime(
246-
date_text, "%d/%m/%Y"
247-
).date()
248-
except ValueError:
249-
# Last resort
250-
collection_date = parse(date_text).date()
251-
252-
# Create bin entry
253-
bin_entry = {
254-
"type": bin_type,
255-
"collectionDate": collection_date.strftime(date_format),
256-
}
257-
258-
# Add to data
259-
data["bins"].append(bin_entry)
260-
bin_count += 1
261-
# print(f"Added bin entry: {bin_entry}")
262-
263-
except Exception as e:
264-
pass
265-
# print(f"Error parsing date '{date_text}': {str(e)}")
266-
267-
# print(f"Successfully parsed {bin_count} bin collections")
163+
# Add to data
164+
data["bins"].append(bin_entry)
165+
# print(f"Added bin entry: {bin_entry}")
268166

269167
if not data["bins"]:
270168
# print("No bin data found. Saving page for debugging...")

0 commit comments

Comments
 (0)