@@ -125,146 +125,44 @@ def parse_data(self, page: str, **kwargs) -> dict:
125
125
# Wait for the page to load - giving it extra time
126
126
time .sleep (5 )
127
127
128
- # Use only the selector that we know works
129
- # print("Looking for bin type elements...")
130
- try :
131
- bin_type_selector = (
132
- By .CSS_SELECTOR ,
133
- "div.formatting_bold.formatting_size_bigger.formatting span.value-as-text" ,
134
- )
135
- WebDriverWait (driver , 15 ).until (
136
- EC .presence_of_element_located (bin_type_selector )
137
- )
138
- # print(f"Found bin type elements with selector: {bin_type_selector}")
139
- except TimeoutException :
140
- # print("Could not find bin type elements. Taking screenshot for debugging...")
141
- screenshot_path = f"bin_type_error_{ int (time .time ())} .png"
142
- driver .save_screenshot (screenshot_path )
143
- # print(f"Screenshot saved to {screenshot_path}")
144
-
145
128
# Create BS4 object from driver's page source
146
129
# print("Parsing page with BeautifulSoup...")
147
130
soup = BeautifulSoup (driver .page_source , features = "html.parser" )
148
131
149
132
# Initialize data dictionary
150
133
data = {"bins" : []}
151
134
152
- # Looking for bin types in the exact HTML structure
153
- bin_type_elements = soup .select (
154
- "div.page_cell.contains_widget:first-of-type div.formatting_bold.formatting_size_bigger.formatting span.value-as-text"
155
- )
156
- # print(f"Found {len(bin_type_elements)} bin type elements")
157
-
158
- # Look specifically for date elements with the exact structure
159
- date_elements = soup .select ("div.col-sm-12.font-xs-3xl span.value-as-text" )
160
- hidden_dates = soup .select (
161
- "div.col-sm-12.font-xs-3xl input[type='hidden'][value*='/']"
162
- )
163
-
164
- # print(f"Found {len(bin_type_elements)} bin types and {len(date_elements)} date elements")
165
-
166
- # We need a smarter way to match bin types with their dates
167
- bin_count = 0
135
+ for row in soup .select (".listing_template_row" ):
136
+ # Title (waste stream) is the first <p> in the section
137
+ first_p = row .find ("p" )
138
+ if not first_p :
139
+ continue
140
+ stream = first_p .get_text (" " , strip = True )
168
141
169
- # Map of bin types to their collection dates
170
- bin_date_map = {}
142
+ for p in row . find_all ( "p" ):
143
+ t = p . get_text ( " \n " , strip = True )
171
144
172
- # Extract all date strings that look like actual dates
173
- date_texts = []
174
- date_pattern = re .compile (
175
- r"(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+\d+(?:st|nd|rd|th)?\s+\w+\s+\d{4}" ,
176
- re .IGNORECASE ,
177
- )
145
+ if re .search (r"\bNext collection\b" , t , flags = re .I ):
146
+ # Expect format: "Next collection\nTuesday 16th September 2025"
147
+ parts = [x .strip () for x in t .split ("\n " ) if x .strip ()]
148
+ if len (parts ) >= 2 :
149
+ next_collection_display = parts [- 1 ] # last line
178
150
179
- for element in date_elements :
180
- text = element .get_text (strip = True )
181
- if date_pattern .search (text ):
182
- date_texts .append (text )
183
- # print(f"Found valid date text: {text}")
184
-
185
- # Find hidden date inputs with values in DD/MM/YYYY format
186
- hidden_date_values = []
187
- for hidden in hidden_dates :
188
- value = hidden .get ("value" , "" )
189
- if re .match (r"\d{1,2}/\d{1,2}/\d{4}" , value ):
190
- hidden_date_values .append (value )
191
- # print(f"Found hidden date value: {value}")
192
-
193
- # When filtering date elements
194
- date_elements = soup .select ("div.col-sm-12.font-xs-3xl span.value-as-text" )
195
- valid_date_elements = []
196
-
197
- for element in date_elements :
198
- text = element .get_text (strip = True )
199
- if contains_date (text ):
200
- valid_date_elements .append (element )
201
- # print(f"Found valid date element: {text}")
202
- else :
203
- pass
204
- # print(f"Skipping non-date element: {text}")
205
-
206
- # print(f"Found {len(bin_type_elements)} bin types and {len(valid_date_elements)} valid date elements")
207
-
208
- # When processing each bin type
209
- for i , bin_type_elem in enumerate (bin_type_elements ):
210
- bin_type = bin_type_elem .get_text (strip = True )
211
-
212
- # Try to find a date for this bin type
213
- date_text = None
214
-
215
- # Look for a valid date element
216
- if i < len (valid_date_elements ):
217
- date_elem = valid_date_elements [i ]
218
- date_text = date_elem .get_text (strip = True )
219
-
220
- # If we don't have a valid date yet, try using the hidden input
221
- if not date_text or not contains_date (date_text ):
222
- if i < len (hidden_dates ):
223
- date_value = hidden_dates [i ].get ("value" )
224
- if contains_date (date_value ):
225
- date_text = date_value
226
-
227
- # Skip if we don't have a valid date
228
- if not date_text or not contains_date (date_text ):
229
- # print(f"No valid date found for bin type: {bin_type}")
230
- continue
151
+ # Build record
152
+ next_date = datetime .strptime (
153
+ remove_ordinal_indicator_from_date_string (next_collection_display ),
154
+ "%A %d %B %Y" ,
155
+ )
231
156
232
- # print(f"Found bin type: {bin_type} with date: {date_text}")
157
+ # Create bin entry
158
+ bin_entry = {
159
+ "type" : stream ,
160
+ "collectionDate" : next_date .strftime (date_format ),
161
+ }
233
162
234
- try :
235
- # Clean up the date text
236
- date_text = remove_ordinal_indicator_from_date_string (date_text )
237
-
238
- # Try to parse the date
239
- try :
240
- collection_date = datetime .strptime (
241
- date_text , "%A %d %B %Y"
242
- ).date ()
243
- except ValueError :
244
- try :
245
- collection_date = datetime .strptime (
246
- date_text , "%d/%m/%Y"
247
- ).date ()
248
- except ValueError :
249
- # Last resort
250
- collection_date = parse (date_text ).date ()
251
-
252
- # Create bin entry
253
- bin_entry = {
254
- "type" : bin_type ,
255
- "collectionDate" : collection_date .strftime (date_format ),
256
- }
257
-
258
- # Add to data
259
- data ["bins" ].append (bin_entry )
260
- bin_count += 1
261
- # print(f"Added bin entry: {bin_entry}")
262
-
263
- except Exception as e :
264
- pass
265
- # print(f"Error parsing date '{date_text}': {str(e)}")
266
-
267
- # print(f"Successfully parsed {bin_count} bin collections")
163
+ # Add to data
164
+ data ["bins" ].append (bin_entry )
165
+ # print(f"Added bin entry: {bin_entry}")
268
166
269
167
if not data ["bins" ]:
270
168
# print("No bin data found. Saving page for debugging...")
0 commit comments