+from xml.etree import ElementTree
+
 import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 )


+def remove_troublesome_chars(string: str):
+    """Remove chars that cause trouble when pushed into postgres."""
+    if type(string) is not str:
+        return string
+    troublesome_chars = {'"': "", "'": "", "\n": " "}
+    for k, v in troublesome_chars.items():
+        string = string.replace(k, v)
+    return string
+
+
 def retry_get(url, retries=3, timeout=4):
+    """Retry a query for a variable amount of tries."""
     retry = Retry(total=retries)
-    session = requests.Session()
-    session.mount("https://", HTTPAdapter(max_retries=retry))
-    return session.get(url, timeout=timeout)
+    with requests.Session() as session:
+        session.mount("https://", HTTPAdapter(max_retries=retry))
+        return session.get(url, timeout=timeout)


 def geojsonToFeatureCollection(geojson: dict) -> dict:
+    """Take a GeoJson and wrap it in a FeatureCollection."""
     if geojson["type"] != "FeatureCollection":
         collection = {
             "type": "FeatureCollection",
@@ -27,33 +41,57 @@ def geojsonToFeatureCollection(geojson: dict) -> dict:
     return geojson


-def query_osm(changeset_id: int):
+def chunks(arr, n_objects):
+    """Return a list of lists with n_objects in each sublist."""
+    return [
+        arr[i * n_objects : (i + 1) * n_objects]
+        for i in range((len(arr) + n_objects - 1) // n_objects)
+    ]
+
+
+def query_osm(changeset_ids: list, changeset_results):
     """Get data from changesetId."""
-    url = OSM_API_LINK + f"changeset/{changeset_id}.json"
+    id_string = ",".join(map(str, changeset_ids))

+    url = OSM_API_LINK + f"changesets?changesets={id_string}"
     response = retry_get(url)
-
     if response.status_code != 200:
         err = f"osm request failed: {response.status_code}"
         logger.warning(f"{err}")
         logger.warning(response.json())
         raise CustomError(err)
-    response = response.json()["elements"][0]
-    return response
+    tree = ElementTree.fromstring(response.content)
+
+    for changeset in tree.iter("changeset"):
+        id = changeset.attrib["id"]
+        username = remove_troublesome_chars(changeset.attrib["user"])
+        userid = changeset.attrib["uid"]
+        comment = created_by = None
+        for tag in changeset.iter("tag"):
+            if tag.attrib["k"] == "comment":
+                comment = tag.attrib["v"]
+            if tag.attrib["k"] == "created_by":
+                created_by = tag.attrib["v"]
+
+        changeset_results[int(id)] = {
+            "username": remove_troublesome_chars(username),
+            "userid": userid,
+            "comment": remove_troublesome_chars(comment),
+            "created_by": remove_troublesome_chars(created_by),
+        }
+    return changeset_results


 def remove_noise_and_add_user_info(json: dict) -> dict:
     """Delete unwanted information from properties."""
     logger.info("starting filtering and adding extra info")
-
     changeset_results = {}
+
     missing_rows = {
         "@changesetId": 0,
         "@lastEdit": 0,
         "@osmId": 0,
         "@version": 0,
-        "created_by": 0,
-        "hashtags": 0,
     }

     for feature in json["features"]:
@@ -64,30 +102,26 @@ def remove_noise_and_add_user_info(json: dict) -> dict:
                     attribute
                 ]
             except KeyError:
-                if attribute != "hashtags":
-                    missing_rows[attribute] += 1
-        changeset_id = new_properties["changesetId"]
-
-        # if changeset_id already queried, use stored result
-        if changeset_id not in changeset_results.keys():
-            changeset_results[changeset_id] = query_osm(changeset_id)
-        new_properties["username"] = changeset_results[changeset_id]["user"]
-        new_properties["userid"] = changeset_results[changeset_id]["uid"]
-        try:
-            new_properties["hashtags"] = changeset_results[changeset_id]["tags"][
-                "hashtags"
-            ]
-        except KeyError:
-            missing_rows["hashtags"] += 1
-
-        try:
-            new_properties["created_by"] = changeset_results[changeset_id]["tags"][
-                "created_by"
-            ]
-        except KeyError:
-            missing_rows["created_by"] += 1
-
+                missing_rows[attribute] += 1
+        changeset_results[new_properties["changesetId"]] = None
         feature["properties"] = new_properties
+
+    len_osm = len(changeset_results.keys())
+    batches = int(len(changeset_results.keys()) / 100) + 1
+    logger.info(
+        f"""{len_osm} changesets will be queried in roughly {batches} batches"""
+    )
+    chunk_list = chunks(list(changeset_results.keys()), 100)
+    for i, subset in enumerate(chunk_list):
+        changeset_results = query_osm(subset, changeset_results)
+        progress = round(100 * ((i + 1) / len(chunk_list)), 1)
+        logger.info(f"finished query {i + 1}/{len(chunk_list)}, {progress}")
+
+    for feature in json["features"]:
+        changeset = changeset_results[feature["properties"]["changesetId"]]
+        for attribute_name in ["username", "comment", "created_by", "userid"]:
+            feature["properties"][attribute_name] = changeset[attribute_name]
+
     logger.info("finished filtering and adding extra info")
     if any(x > 0 for x in missing_rows.values()):
         logger.warning(f"features missing values:\n{missing_rows}")
@@ -96,9 +130,7 @@ def remove_noise_and_add_user_info(json: dict) -> dict:


 def ohsome(request: dict, area: str, properties=None) -> dict:
-    """
-    Request data from Ohsome API.
-    """
+    """Request data from Ohsome API."""
     url = OHSOME_API_LINK + request["endpoint"]
     data = {"bpolys": area, "filter": request["filter"]}
     if properties:
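
For reference, a minimal usage sketch of the new batching helpers (illustrative only, not part of the commit above; it assumes the module's functions and its OSM_API_LINK, logger, and CustomError are available, and the changeset ids are placeholders):

# Illustrative sketch: batch placeholder changeset ids into groups of 100 and
# resolve their metadata via query_osm, mirroring the loop added in
# remove_noise_and_add_user_info.
changeset_ids = [1, 2, 3]  # placeholder ids, not real changesets
changeset_results = {cid: None for cid in changeset_ids}
for subset in chunks(list(changeset_results.keys()), 100):
    changeset_results = query_osm(subset, changeset_results)
# changeset_results now maps each id to username, userid, comment, created_by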