1
- import time
2
1
import re
3
- import requests
2
+ import time
4
3
from datetime import datetime
4
+
5
+ import requests
5
6
from bs4 import BeautifulSoup
6
7
from selenium .webdriver .common .by import By
7
8
from selenium .webdriver .common .keys import Keys
8
9
from selenium .webdriver .support import expected_conditions as EC
9
10
from selenium .webdriver .support .ui import WebDriverWait
11
+
10
12
from uk_bin_collection .uk_bin_collection .common import *
11
13
from uk_bin_collection .uk_bin_collection .get_bin_data import AbstractGetBinDataClass
12
14
15
+
13
16
def get_street_from_postcode (postcode : str , api_key : str ) -> str :
14
17
url = "https://maps.googleapis.com/maps/api/geocode/json"
15
18
params = {"address" : postcode , "key" : api_key }
@@ -25,6 +28,7 @@ def get_street_from_postcode(postcode: str, api_key: str) -> str:
25
28
26
29
raise ValueError ("No street (route) found in the response." )
27
30
31
+
28
32
class CouncilClass (AbstractGetBinDataClass ):
29
33
def parse_data (self , page : str , ** kwargs ) -> dict :
30
34
driver = None
@@ -37,10 +41,10 @@ def parse_data(self, page: str, **kwargs) -> dict:
37
41
38
42
headless = kwargs .get ("headless" )
39
43
web_driver = kwargs .get ("web_driver" )
40
- driver = create_webdriver (web_driver , headless , None , __name__ )
44
+ UserAgent = "Mozilla/5.0"
45
+ driver = create_webdriver (web_driver , headless , UserAgent , __name__ )
41
46
page = "https://www.slough.gov.uk/bin-collections"
42
47
driver .get (page )
43
-
44
48
# Accept cookies
45
49
WebDriverWait (driver , 10 ).until (
46
50
EC .element_to_be_clickable ((By .ID , "ccc-recommended-settings" ))
@@ -50,14 +54,20 @@ def parse_data(self, page: str, **kwargs) -> dict:
50
54
address_input = WebDriverWait (driver , 10 ).until (
51
55
EC .presence_of_element_located ((By .ID , "keyword_directory25" ))
52
56
)
53
- user_address = get_street_from_postcode (user_postcode , "AIzaSyBDLULT7EIlNtHerswPtfmL15Tt3Oc0bV8" )
57
+ user_address = get_street_from_postcode (
58
+ user_postcode , "AIzaSyBDLULT7EIlNtHerswPtfmL15Tt3Oc0bV8"
59
+ )
54
60
address_input .send_keys (user_address + Keys .ENTER )
55
61
56
62
# Wait for address results to load
57
63
WebDriverWait (driver , 10 ).until (
58
- EC .presence_of_all_elements_located ((By .CSS_SELECTOR , "span.list__link-text" ))
64
+ EC .presence_of_all_elements_located (
65
+ (By .CSS_SELECTOR , "span.list__link-text" )
66
+ )
67
+ )
68
+ span_elements = driver .find_elements (
69
+ By .CSS_SELECTOR , "span.list__link-text"
59
70
)
60
- span_elements = driver .find_elements (By .CSS_SELECTOR , "span.list__link-text" )
61
71
62
72
for span in span_elements :
63
73
if user_address .lower () in span .text .lower ():
@@ -68,7 +78,9 @@ def parse_data(self, page: str, **kwargs) -> dict:
68
78
69
79
# Wait for address detail page
70
80
WebDriverWait (driver , 10 ).until (
71
- EC .presence_of_element_located ((By .CSS_SELECTOR , "section.site-content" ))
81
+ EC .presence_of_element_located (
82
+ (By .CSS_SELECTOR , "section.site-content" )
83
+ )
72
84
)
73
85
soup = BeautifulSoup (driver .page_source , "html.parser" )
74
86
@@ -86,28 +98,33 @@ def parse_data(self, page: str, **kwargs) -> dict:
86
98
bin_url = "https://www.slough.gov.uk" + bin_url
87
99
88
100
# Visit the child page
89
- print (f"Navigating to { bin_url } " )
101
+ # print(f"Navigating to {bin_url}")
90
102
driver .get (bin_url )
91
103
WebDriverWait (driver , 10 ).until (
92
- EC .presence_of_element_located ((By .CSS_SELECTOR , "div.page-content" ))
104
+ EC .presence_of_element_located (
105
+ (By .CSS_SELECTOR , "div.page-content" )
106
+ )
93
107
)
94
108
child_soup = BeautifulSoup (driver .page_source , "html.parser" )
95
109
96
110
editor_div = child_soup .find ("div" , class_ = "editor" )
97
111
if not editor_div :
98
- print ("No editor div found on bin detail page." )
112
+ # print("No editor div found on bin detail page.")
99
113
continue
100
114
101
115
ul = editor_div .find ("ul" )
102
116
if not ul :
103
- print ("No <ul> with dates found in editor div." )
117
+ # print("No <ul> with dates found in editor div.")
104
118
continue
105
119
106
120
for li in ul .find_all ("li" ):
107
121
raw_text = li .get_text (strip = True ).replace ("." , "" )
108
122
109
- if "no collection" in raw_text .lower () or "no collections" in raw_text .lower ():
110
- print (f"Ignoring non-collection note: { raw_text } " )
123
+ if (
124
+ "no collection" in raw_text .lower ()
125
+ or "no collections" in raw_text .lower ()
126
+ ):
127
+ # print(f"Ignoring non-collection note: {raw_text}")
111
128
continue
112
129
113
130
raw_date = raw_text
@@ -117,24 +134,25 @@ def parse_data(self, page: str, **kwargs) -> dict:
117
134
except ValueError :
118
135
raw_date_cleaned = raw_date .split ("(" )[0 ].strip ()
119
136
try :
120
- parsed_date = datetime .strptime (raw_date_cleaned , "%d %B %Y" )
137
+ parsed_date = datetime .strptime (
138
+ raw_date_cleaned , "%d %B %Y"
139
+ )
121
140
except Exception :
122
141
print (f"Could not parse date: { raw_text } " )
123
142
continue
124
143
125
144
formatted_date = parsed_date .strftime ("%d/%m/%Y" )
126
145
contains_date (formatted_date )
127
- bin_data ["bins" ].append ({
128
- "type" : bin_type ,
129
- "collectionDate" : formatted_date
130
- })
146
+ bin_data ["bins" ].append (
147
+ {"type" : bin_type , "collectionDate" : formatted_date }
148
+ )
131
149
132
- print (f"Type: { bin_type } , Date: { formatted_date } " )
150
+ # print(f"Type: {bin_type}, Date: {formatted_date}")
133
151
134
152
except Exception as e :
135
153
print (f"An error occurred: { e } " )
136
154
raise
137
155
finally :
138
156
if driver :
139
157
driver .quit ()
140
- return bin_data
158
+ return bin_data
0 commit comments