-import requests
+from typing import Dict, List, Any, Optional
 
 from bs4 import BeautifulSoup
-
-from uk_bin_collection.uk_bin_collection.common import *
+import requests
+import re
+from datetime import datetime
+from uk_bin_collection.uk_bin_collection.common import check_uprn, check_postcode, date_format
 
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
 
 
-# import the wonderful Beautiful Soup and the URL grabber
 
 class CouncilClass(AbstractGetBinDataClass):
-    """
-    Concrete classes have to implement all abstract operations of the
-    base class. They can also override some operations with a default
-    implementation.
-    """
+    def get_data(self, url: str) -> str:
+        # This method is not used in the current implementation
+        return ""
 
-    def parse_data(self, page: str, **kwargs) -> dict:
+    def parse_data(self, page: str, **kwargs: Any) -> Dict[str, List[Dict[str, str]]]:
+        postcode: Optional[str] = kwargs.get("postcode")
+        uprn: Optional[str] = kwargs.get("uprn")
 
-        user_uprn = kwargs.get("uprn")
-        user_postcode = kwargs.get("postcode")
-        check_uprn(user_uprn)
-        check_postcode(user_postcode)
-        bindata = {"bins": []}
+        if postcode is None or uprn is None:
+            raise ValueError("Both postcode and UPRN are required.")
 
-        session_uri = "https://forms.chorleysouthribble.gov.uk/xfp/form/70"
-        URI = "https://forms.chorleysouthribble.gov.uk/xfp/form/70#qc576c657112a8277ba6f954ebc0490c946168363_0"
+        check_postcode(postcode)
+        check_uprn(uprn)
 
         session = requests.Session()
-        token_response = session.get(session_uri)
-        soup = BeautifulSoup(token_response.text, "html.parser")
-        token = soup.find("input", {"name": "__token"}).attrs["value"]
-
-        form_data = {
-            "__token": token,
-            "page": "196",
-            "locale": "en_GB",
-            "qc576c657112a8277ba6f954ebc0490c946168363_0_0": user_postcode,
-            "qc576c657112a8277ba6f954ebc0490c946168363_1_0": user_uprn,
-            "next": "Next",
+        headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+                "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
+            )
         }
+        session.headers.update(headers)
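+        # Assumption: the form host may reject the default python-requests
+        # User-Agent, hence the browser UA above.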
35
+
36
+ # Step 1: Load form and get token + field names
37
+ initial_url = "https://forms.chorleysouthribble.gov.uk/xfp/form/70"
38
+ get_resp = session .get (initial_url )
39
+ soup = BeautifulSoup (get_resp .text , "html.parser" )
40
+
41
+ token = soup .find ("input" , {"name" : "__token" })["value" ]
42
+ page_id = soup .find ("input" , {"name" : "page" })["value" ]
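+        # Form field names carry a per-form hash prefix (the old code hard-coded
+        # "qc576c657112a8277ba6f954ebc0490c946168363_0_0"), so match on the
+        # "_0_0" / "_1_0" suffixes instead of the full names.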
+        postcode_field = soup.find("input", {"type": "text", "name": re.compile(".*_0_0")})["name"]
+
+        # Step 2: Submit postcode
+        post_resp = session.post(
+            initial_url,
+            data={
+                "__token": token,
+                "page": page_id,
+                "locale": "en_GB",
+                postcode_field: postcode,
+                "next": "Next",
+            },
+        )
 
-        collection_response = session.post(URI, data=form_data)
+        soup = BeautifulSoup(post_resp.text, "html.parser")
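+        # The form appears to issue a fresh __token with each page, so re-read
+        # it before the next POST.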
+        token = soup.find("input", {"name": "__token"})["value"]
+        address_field_el = soup.find("select", {"name": re.compile(".*_1_0")})
+        if not address_field_el:
+            raise ValueError("Failed to find address dropdown after postcode submission.")
 
-        #collection_soup = BeautifulSoup(collection_response.text, "html.parser")
-
+        address_field = address_field_el["name"]
 
-        soup = BeautifulSoup(collection_response.text, "html.parser")
-        #print(soup)
+        # Step 3: Submit UPRN and retrieve bin data
+        final_resp = session.post(
+            initial_url,
+            data={
+                "__token": token,
+                "page": page_id,
+                "locale": "en_GB",
+                postcode_field: postcode,
+                address_field: uprn,
+                "next": "Next",
+            },
+        )
 
-        rows = soup.find("table").find_all("tr")
+        soup = BeautifulSoup(final_resp.text, "html.parser")
+        table = soup.find("table", class_="data-table")
+        if not table:
+            raise ValueError("Could not find bin collection table.")
 
-        # Form a JSON wrapper
+        rows = table.find("tbody").find_all("tr")
 
         data: Dict[str, List[Dict[str, str]]] = {"bins": []}
 
-        # Loops the Rows
+        # Extract bin type mapping from JavaScript
+        bin_type_map = {}
+        scripts = soup.find_all("script", type="text/javascript")
+        for script in scripts:
+            if script.string and "const bintype = {" in script.string:
+                match = re.search(r"const bintype = \{([^}]+)\}", script.string, re.DOTALL)
+                if match:
+                    bintype_content = match.group(1)
+                    for line in bintype_content.split("\n"):
+                        line = line.strip()
+                        if '"' in line and ":" in line:
+                            parts = line.split(":", 1)
+                            if len(parts) == 2:
+                                key = parts[0].strip().strip('"').strip("'")
+                                value = parts[1].strip().rstrip(",").strip().strip('"').strip("'")
+                                bin_type_map[key] = value
+                break
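+        # Assumed script shape, for reference (keys and values illustrative):
+        #   const bintype = { "Grey bin": "Domestic Waste", ... };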
+
 
         for row in rows:
             cells = row.find_all("td")
-
-            if cells:
-                bin_type = cells[0].get_text(strip=True)
-                collection_next = cells[1].get_text(strip=True)
-
-                if len(collection_next) != 1:
-                    collection_date_obj = datetime.strptime(collection_next, "%d/%m/%y").date()
-                    # since we only have the next collection day, if the parsed date is in the past,
-                    # assume the day is instead next month
-                    if collection_date_obj < datetime.now().date():
-                        collection_date_obj += relativedelta(months=1)
-                    # Make each Bin element in the JSON
-                    dict_data = {
+            if len(cells) >= 2:
+                bin_type_cell = cells[0]
+                bin_type = bin_type_cell.get_text(strip=True)
+                bin_type = bin_type_map.get(bin_type, bin_type)
+
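+                # The date cell may carry a weekday prefix (e.g. "Fri, 01/08/25");
+                # keep only the date portion.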
+                date_text = cells[1].get_text(strip=True)
+                date_parts = date_text.split(", ")
+                date_str = date_parts[1] if len(date_parts) == 2 else date_text
+
+                try:
+                    day, month, year = date_str.split("/")
+                    year = int(year)
+                    if year < 100:
+                        year = 2000 + year
+
+                    date_obj = datetime(year, int(month), int(day)).date()
+
+                    data["bins"].append({
                         "type": bin_type,
-                        "collectionDate": collection_date_obj.strftime("%d/%m/%Y"),
-                    }
-                    # Add data to the main JSON Wrapper
-                    data["bins"].append(dict_data)
-                    continue
+                        "collectionDate": date_obj.strftime(date_format),
+                    })
+                except ValueError:
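+                    # Skip rows whose date cannot be parsed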
+                    continue
+
 
         return data
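
A minimal invocation sketch for the updated parser (the postcode and UPRN below are illustrative placeholders, not real values):

    council = CouncilClass()
    result = council.parse_data("", postcode="PR7 1DP", uprn="100010000000")
    for bin_entry in result["bins"]:
        print(bin_entry["type"], bin_entry["collectionDate"])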