Skip to content

Commit 297409d

Browse files
committed
add description to functions #247
1 parent 3a34b32 commit 297409d

File tree

1 file changed

+107
-53
lines changed

1 file changed

+107
-53
lines changed

mapswipe_workers/mapswipe_workers/generate_stats/tasking_manager_geometries.py

Lines changed: 107 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99

1010

1111
def load_data(project_id: str, csv_file: str) -> list:
12+
"""
13+
This will load the aggregated results csv file into a list of dictionaries.
14+
For the further processing steps we currently rely on task_x, task_y, task_z, yes_share, maybe_share and wkt
15+
"""
1216

1317
project_data = []
1418
with open(csv_file, "r") as f:
@@ -37,6 +41,10 @@ def load_data(project_id: str, csv_file: str) -> list:
3741
"yes_count": int(row[3]),
3842
"maybe_count": int(row[4]),
3943
"bad_imagery_count": int(row[5]),
44+
"no_share": float(row[7]),
45+
"yes_share": float(row[8]),
46+
"maybe_share": float(row[9]),
47+
"bad_imagery_share": float(row[10]),
4048
"wkt": tile_functions.geometry_from_tile_coords(
4149
task_x, task_y, task_z
4250
),
@@ -47,34 +55,35 @@ def load_data(project_id: str, csv_file: str) -> list:
4755

4856

4957
def yes_maybe_condition_true(x: dict) -> bool:
50-
# TODO: use no count here as well
51-
52-
# TODO: set this to a good number
53-
# if x["yes_count"] > 1:
54-
if x["yes_count"] > 0:
55-
return True
56-
elif x["yes_count"] > 1 and x["yes_count"] >= x["bad_imagery_count"]:
57-
return True
58-
elif x["maybe_count"] > 1 and x["maybe_count"] >= x["bad_imagery_count"]:
59-
return True
60-
elif (
61-
x["yes_count"] >= 1
62-
and x["maybe_count"] >= 1
63-
and ((x["yes_count"] + x["maybe_count"]) >= x["bad_imagery_count"])
64-
):
58+
"""
59+
The yes maybe condition is true if 35% or
60+
2 (or more) out of 3 users
61+
2 (or more) out of 4 users
62+
2 (or more) out of 5 users
63+
have classified as 'yes' or 'maybe'
64+
"""
65+
66+
if x["yes_share"] + x["maybe_share"] > 0.35:
6567
return True
6668
else:
6769
return False
6870

6971

70-
def filter_data(project_id: str, project_data: list) -> list:
72+
def filter_data(project_data: list) -> list:
73+
"""
74+
Filter results that fulfil the yes_maybe_condition.
75+
"""
7176

7277
# filter yes and maybe
7378
filtered_project_data = [x for x in project_data if yes_maybe_condition_true(x)]
7479
return filtered_project_data
7580

7681

7782
def check_list_sum(x, range_val):
83+
"""
84+
This checks if a given tile belongs to the defined "star"-shaped neighbourhood
85+
"""
86+
7887
item_sum = abs(x[0]) + abs(x[1])
7988
if item_sum <= range_val:
8089
return True
@@ -83,6 +92,10 @@ def check_list_sum(x, range_val):
8392

8493

8594
def get_neighbour_list(neighbourhood_shape: str, neighbourhood_size: int) -> list:
95+
"""
96+
Filters tiles that are neighbours.
97+
This is based on a given search radius (neighbourhood size) and search window shape (neighbourhood shape).
98+
"""
8699

87100
neighbour_list = []
88101
range_val = int(neighbourhood_size / 2)
@@ -99,37 +112,38 @@ def get_neighbour_list(neighbourhood_shape: str, neighbourhood_size: int) -> lis
99112
return neighbour_list
100113

101114

102-
def check_neighbours(task_x: int, task_y: int, group_id: int):
103-
# TODO: use zoom level from task
115+
def add_group_id_to_neighbours(task_x: int, task_y: int, task_z: int, group_id: int):
116+
"""
117+
Add a group id to all other tiles that are in the neighbourhood of the given tile,
118+
which is defined by task_x, task_y and task_z.
119+
"""
104120

105121
# look for neighbours
106-
neighbours = []
107122
for i, j in neighbour_list:
108123
new_task_x = int(task_x) + i
109124
new_task_y = int(task_y) + j
110-
new_task_id = f"18-{task_x}-{task_y}".format(
111-
task_x=new_task_x, task_y=new_task_y
112-
)
125+
new_task_id = f"{task_z}-{new_task_x}-{new_task_y}"
113126

114127
if new_task_id in yes_results_dict:
115128
yes_results_dict[new_task_id]["my_group_id"] = group_id
116-
neighbours.append(new_task_id)
117129

118130

119-
def create_duplicates_dict():
131+
def create_duplicates_dict() -> dict:
132+
"""
133+
Check which tasks belong to multiple groups.
134+
This will be used at a later stage to put tasks into distinct groups.
135+
"""
136+
120137
duplicated_groups = {}
121138
for task_id in yes_results_dict.keys():
122139
my_group_id = yes_results_dict[task_id]["my_group_id"]
123140
# check for other results in the neighbourhood
124-
task_x = yes_results_dict[task_id]["task_x"]
125-
task_y = yes_results_dict[task_id]["task_y"]
126-
127141
# look for neighbours
128142
for i, j in neighbour_list:
129-
new_task_x = int(task_x) + i
130-
new_task_y = int(task_y) + j
131-
new_task_id = "18-{task_x}-{task_y}".format(
132-
task_x=new_task_x, task_y=new_task_y
143+
new_task_x = int(yes_results_dict[task_id]["task_x"]) + i
144+
new_task_y = int(yes_results_dict[task_id]["task_y"]) + j
145+
new_task_id = (
146+
f"{yes_results_dict[task_id]['task_z']}-{new_task_x}-{new_task_y}"
133147
)
134148

135149
if new_task_id in yes_results_dict:
@@ -149,11 +163,16 @@ def create_duplicates_dict():
149163
return duplicated_groups
150164

151165

152-
def remove_duplicates(duplicated_groups):
166+
def remove_duplicates(duplicated_groups: dict):
167+
"""
168+
Remove group ids for tasks which have more than one.
169+
This is to make sure that every task belongs to a single group only.
170+
This distinct group id will be the basis for further geometric processing.
171+
"""
172+
153173
for duplicated_group_id in sorted(duplicated_groups.keys(), reverse=True):
154174
logger.debug(
155-
"%s: %s"
156-
% (duplicated_group_id, list(duplicated_groups[duplicated_group_id]))
175+
f"{duplicated_group_id}: {list(duplicated_groups[duplicated_group_id])}"
157176
)
158177
my_duplicated_group_id = duplicated_group_id
159178
for other_group_id in duplicated_groups[duplicated_group_id]:
@@ -166,9 +185,20 @@ def remove_duplicates(duplicated_groups):
166185

167186

168187
def split_groups(q):
188+
"""
189+
This function will be executed using threading.
190+
First it checks if there are still processes pending in the queue.
191+
We are using a clustering algorithm to put tasks together in groups.
192+
Since it is computationally expensive to check which tiles are neighbours,
193+
we split our results into chunks (called groups here).
194+
When a group's size falls below the defined maximum group size we will stop.
195+
Otherwise, the group will be split into two parts and
196+
both will be added as new groups to our queue.
197+
"""
198+
169199
while not q.empty():
170200
group_id, group_data, group_size = q.get()
171-
logger.debug("the group (%s) has %s members" % (group_id, len(group_data)))
201+
logger.debug(f"the group ({group_id}) has {len(group_data)} members")
172202

173203
# find min x, and min y
174204
x_list = []
@@ -244,7 +274,16 @@ def create_hot_tm_tasks(
244274
neighbourhood_shape: str = "rectangle",
245275
neighbourhood_size: int = 5,
246276
) -> dict:
247-
# TODO: check input dict structure
277+
"""
278+
This function creates a dictionary of tiles which will be forming a task in the HOT Tasking Manager.
279+
It will create a neighbourhood list, which will function as a mask to filter tiles that are close to each other.
280+
The function assigns group ids to each tile.
281+
Tiles that received several group ids will be resolved in the next step.
282+
Once each task has a unique group id, the function checks the size (number of tiles) for each group.
283+
Groups that hold too many tiles (too big to map in the Tasking Manager) will be split into smaller groups.
284+
Finally, a dictionary is returned which holds each group as an item.
285+
Each group consists of a limited number of tiles.
286+
"""
248287

249288
# final groups dict will store the groups that are exported
250289
final_groups_dict = {}
@@ -286,11 +325,13 @@ def create_hot_tm_tasks(
286325
logger.debug("created new group id")
287326
logger.debug("group id: %s" % group_id)
288327

289-
# check for other results in the neighbourhood
290-
task_x = yes_results_dict[task_id]["task_x"]
291-
task_y = yes_results_dict[task_id]["task_y"]
292-
293-
check_neighbours(task_x, task_y, group_id)
328+
# check for other results in the neighbourhood and add the group id to them
329+
add_group_id_to_neighbours(
330+
yes_results_dict[task_id]["task_x"],
331+
yes_results_dict[task_id]["task_y"],
332+
yes_results_dict[task_id]["task_z"],
333+
group_id,
334+
)
294335

295336
logger.info("added group ids to yes maybe results dict")
296337

@@ -352,6 +393,11 @@ def create_hot_tm_tasks(
352393

353394

354395
def dissolve_project_data(project_data):
396+
"""
397+
This function uses the unionCascaded function to return a dissolved MultiPolygon geometry
398+
from several Single Part Polygon geometries.
399+
"""
400+
355401
multipolygon_geometry = ogr.Geometry(ogr.wkbMultiPolygon)
356402
for item in project_data:
357403
polygon = ogr.CreateGeometryFromWkt(item["wkt"])
@@ -362,6 +408,13 @@ def dissolve_project_data(project_data):
362408

363409

364410
def generate_tasking_manager_geometries(project_id: str):
411+
"""
412+
This function runs the workflow to create a GeoJSON file ready to be used in the HOT Tasking Manager.
413+
First, data is loaded from the aggregated results csv file.
414+
Then it filters results for which a defined threshold of yes and maybe classifications has been reached.
415+
We then derive the Tasking Manager geometries, and a dissolved geometry of all filtered results.
416+
Finally, both data sets are saved into GeoJSON files.
417+
"""
365418

366419
raw_data_filename = f"{DATA_PATH}/api-data/agg_results/agg_results_{project_id}.csv"
367420
filtered_data_filename = (
@@ -375,18 +428,19 @@ def generate_tasking_manager_geometries(project_id: str):
375428
results = load_data(project_id, raw_data_filename)
376429

377430
# filter yes and maybe results
378-
filtered_results = filter_data(project_id, results)
431+
filtered_results = filter_data(results)
379432

380-
# dissolve filtered results
381-
dissolved_filtered_results = dissolve_project_data(filtered_results)
433+
if len(filtered_results) > 0:
434+
# dissolve filtered results
435+
dissolved_filtered_results = dissolve_project_data(filtered_results)
382436

383-
# create tasking manager geometries
384-
tasking_manager_results = create_hot_tm_tasks(project_id, filtered_results)
437+
# create tasking manager geometries
438+
tasking_manager_results = create_hot_tm_tasks(project_id, filtered_results)
385439

386-
# save data as geojson
387-
geojson_functions.create_geojson_file(
388-
dissolved_filtered_results, filtered_data_filename
389-
)
390-
geojson_functions.create_geojson_file_from_dict(
391-
tasking_manager_results, tasking_manager_data_filename
392-
)
440+
# save data as geojson
441+
geojson_functions.create_geojson_file(
442+
dissolved_filtered_results, filtered_data_filename
443+
)
444+
geojson_functions.create_geojson_file_from_dict(
445+
tasking_manager_results, tasking_manager_data_filename
446+
)

0 commit comments

Comments
 (0)