Commit c352836

Merge pull request #505 from mapswipe/bundle_changesets
bundle changesets
2 parents 3397453 + adb4959 commit c352836

1 file changed: +69 -37 lines changed

mapswipe_workers/mapswipe_workers/utils/api_calls.py

Lines changed: 69 additions & 37 deletions
@@ -1,3 +1,5 @@
+from xml.etree import ElementTree
+
 import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
@@ -10,14 +12,26 @@
 )
 
 
+def remove_troublesome_chars(string: str):
+    """Remove chars that cause trouble when pushed into postgres."""
+    if type(string) is not str:
+        return string
+    troublesome_chars = {'"': "", "'": "", "\n": " "}
+    for k, v in troublesome_chars.items():
+        string = string.replace(k, v)
+    return string
+
+
 def retry_get(url, retries=3, timeout=4):
+    """Retry a query for a variable amount of tries."""
     retry = Retry(total=retries)
-    session = requests.Session()
-    session.mount("https://", HTTPAdapter(max_retries=retry))
-    return session.get(url, timeout=timeout)
+    with requests.Session() as session:
+        session.mount("https://", HTTPAdapter(max_retries=retry))
+        return session.get(url, timeout=timeout)
 
 
 def geojsonToFeatureCollection(geojson: dict) -> dict:
+    """Take a GeoJson and wrap it in a FeatureCollection."""
     if geojson["type"] != "FeatureCollection":
         collection = {
             "type": "FeatureCollection",
@@ -27,33 +41,57 @@ def geojsonToFeatureCollection(geojson: dict) -> dict:
     return geojson
 
 
-def query_osm(changeset_id: int):
+def chunks(arr, n_objects):
+    """Return a list of list with n_objects in each sublist."""
+    return [
+        arr[i * n_objects : (i + 1) * n_objects]
+        for i in range((len(arr) + n_objects - 1) // n_objects)
+    ]
+
+
+def query_osm(changeset_ids: list, changeset_results):
     """Get data from changesetId."""
-    url = OSM_API_LINK + f"changeset/{changeset_id}.json"
+    id_string = ",".join(map(str, changeset_ids))
 
+    url = OSM_API_LINK + f"changesets?changesets={id_string}"
     response = retry_get(url)
-
     if response.status_code != 200:
         err = f"osm request failed: {response.status_code}"
         logger.warning(f"{err}")
         logger.warning(response.json())
         raise CustomError(err)
-    response = response.json()["elements"][0]
-    return response
+    tree = ElementTree.fromstring(response.content)
+
+    for changeset in tree.iter("changeset"):
+        id = changeset.attrib["id"]
+        username = remove_troublesome_chars(changeset.attrib["user"])
+        userid = changeset.attrib["uid"]
+        comment = created_by = None
+        for tag in changeset.iter("tag"):
+            if tag.attrib["k"] == "comment":
+                comment = tag.attrib["v"]
+            if tag.attrib["k"] == "created_by":
+                created_by = tag.attrib["v"]
+
+        changeset_results[int(id)] = {
+            "username": remove_troublesome_chars(username),
+            "userid": userid,
+            "comment": remove_troublesome_chars(comment),
+            "created_by": remove_troublesome_chars(created_by),
+        }
+    return changeset_results
 
 
 def remove_noise_and_add_user_info(json: dict) -> dict:
     """Delete unwanted information from properties."""
     logger.info("starting filtering and adding extra info")
-
     changeset_results = {}
+
     missing_rows = {
         "@changesetId": 0,
         "@lastEdit": 0,
         "@osmId": 0,
         "@version": 0,
-        "created_by": 0,
-        "hashtags": 0,
     }
 
     for feature in json["features"]:
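The bundling above hinges on two pieces: the chunks() helper that splits changeset ids into batches, and ElementTree parsing of the multi-changeset XML response. A small, self-contained sketch of that flow (the sample XML is made up for illustration, not a real OSM API response):

```python
from xml.etree import ElementTree


def chunks(arr, n_objects):
    """Split arr into sublists of at most n_objects items (same logic as above)."""
    return [
        arr[i * n_objects : (i + 1) * n_objects]
        for i in range((len(arr) + n_objects - 1) // n_objects)
    ]


# 250 changeset ids end up in 3 batches of at most 100 ids each.
changeset_ids = list(range(1, 251))
print([len(batch) for batch in chunks(changeset_ids, 100)])  # [100, 100, 50]

# Parsing a (made-up) changesets XML payload, as the new query_osm does.
sample = """<osm>
  <changeset id="42" user="some_mapper" uid="7">
    <tag k="comment" v="add buildings #mapswipe"/>
    <tag k="created_by" v="JOSM/1.5"/>
  </changeset>
</osm>"""
for changeset in ElementTree.fromstring(sample).iter("changeset"):
    print(changeset.attrib["id"], changeset.attrib["user"], changeset.attrib["uid"])
    for tag in changeset.iter("tag"):
        print(tag.attrib["k"], "->", tag.attrib["v"])
```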
@@ -64,30 +102,26 @@ def remove_noise_and_add_user_info(json: dict) -> dict:
                     attribute
                 ]
             except KeyError:
-                if attribute != "hashtags":
-                    missing_rows[attribute] += 1
-        changeset_id = new_properties["changesetId"]
-
-        # if changeset_id already queried, use stored result
-        if changeset_id not in changeset_results.keys():
-            changeset_results[changeset_id] = query_osm(changeset_id)
-        new_properties["username"] = changeset_results[changeset_id]["user"]
-        new_properties["userid"] = changeset_results[changeset_id]["uid"]
-        try:
-            new_properties["hashtags"] = changeset_results[changeset_id]["tags"][
-                "hashtags"
-            ]
-        except KeyError:
-            missing_rows["hashtags"] += 1
-
-        try:
-            new_properties["created_by"] = changeset_results[changeset_id]["tags"][
-                "created_by"
-            ]
-        except KeyError:
-            missing_rows["created_by"] += 1
-
+                missing_rows[attribute] += 1
+        changeset_results[new_properties["changesetId"]] = None
         feature["properties"] = new_properties
+
+    len_osm = len(changeset_results.keys())
+    batches = int(len(changeset_results.keys()) / 100) + 1
+    logger.info(
+        f"""{len_osm} changesets will be queried in roughly {batches} batches"""
+    )
+    chunk_list = chunks(list(changeset_results.keys()), 100)
+    for i, subset in enumerate(chunk_list):
+        changeset_results = query_osm(subset, changeset_results)
+        progress = round(100 * ((i + 1) / len(chunk_list)), 1)
+        logger.info(f"finished query {i+1}/{len(chunk_list)}, {progress}")
+
+    for feature in json["features"]:
+        changeset = changeset_results[feature["properties"]["changesetId"]]
+        for attribute_name in ["username", "comment", "created_by", "userid"]:
+            feature["properties"][attribute_name] = changeset[attribute_name]
+
     logger.info("finished filtering and adding extra info")
     if any(x > 0 for x in missing_rows.values()):
         logger.warning(f"features missing values:\n{missing_rows}")
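The practical effect on OSM API traffic: one XML request now covers up to 100 changesets, instead of one JSON request per changeset. A sketch of the two URL shapes, with OSM_API_LINK assumed to point at the 0.6 API root (the real constant is defined elsewhere in the project):

```python
# Assumed value for illustration only.
OSM_API_LINK = "https://www.openstreetmap.org/api/0.6/"

changeset_id = 123456
old_url = OSM_API_LINK + f"changeset/{changeset_id}.json"  # before: one request per changeset

batch = [123456, 123457, 123458]
id_string = ",".join(map(str, batch))
new_url = OSM_API_LINK + f"changesets?changesets={id_string}"  # after: one request per batch
print(old_url)
print(new_url)
```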
@@ -96,9 +130,7 @@ def remove_noise_and_add_user_info(json: dict) -> dict:
 
 
 def ohsome(request: dict, area: str, properties=None) -> dict:
-    """
-    Request data from Ohsome API.
-    """
+    """Request data from Ohsome API."""
     url = OHSOME_API_LINK + request["endpoint"]
     data = {"bpolys": area, "filter": request["filter"]}
     if properties:
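For completeness, an illustrative use of the two helpers touched in this diff, remove_troublesome_chars() and retry_get(). The import path is inferred from this file's location in the repo, and the URL is only an example:

```python
# Assumes the mapswipe_workers package from this repo is installed.
from mapswipe_workers.utils.api_calls import remove_troublesome_chars, retry_get

# Quotes and newlines are stripped before values are pushed into Postgres.
print(remove_troublesome_chars('say "hi"\nthere'))  # -> say hi there
print(remove_troublesome_chars(None))               # non-strings are returned unchanged

# retry_get now closes the underlying Session via a context manager while
# still retrying failed connection attempts up to `retries` times.
response = retry_get(
    "https://www.openstreetmap.org/api/0.6/changesets?changesets=1", retries=3
)
print(response.status_code)
```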
