99
1010
1111def load_data (project_id : str , csv_file : str ) -> list :
12+ """
13+ This will load the aggregated results csv file into a list of dictionaries.
14+ For further steps we currently rely on task_x, task_y, task_z and yes_share and maybe_share and wkt
15+ """
1216
1317 project_data = []
1418 with open (csv_file , "r" ) as f :
@@ -37,6 +41,10 @@ def load_data(project_id: str, csv_file: str) -> list:
3741 "yes_count" : int (row [3 ]),
3842 "maybe_count" : int (row [4 ]),
3943 "bad_imagery_count" : int (row [5 ]),
44+ "no_share" : float (row [7 ]),
45+ "yes_share" : float (row [8 ]),
46+ "maybe_share" : float (row [9 ]),
47+ "bad_imagery_share" : float (row [10 ]),
4048 "wkt" : tile_functions .geometry_from_tile_coords (
4149 task_x , task_y , task_z
4250 ),
@@ -47,34 +55,35 @@ def load_data(project_id: str, csv_file: str) -> list:
4755
4856
def yes_maybe_condition_true(x: dict) -> bool:
    """
    Check whether a task has enough 'yes' or 'maybe' classifications.

    The condition is true if more than 35% of the users classified the
    task as 'yes' or 'maybe', e.g.:
        2 (or more) out of 3 users
        2 (or more) out of 4 users
        2 (or more) out of 5 users

    Parameters
    ----------
    x : dict
        A result row; must contain the float keys "yes_share" and
        "maybe_share".

    Returns
    -------
    bool
        True if the combined yes/maybe share exceeds 0.35.
    """
    # The comparison already yields a bool; no if/else needed.
    return x["yes_share"] + x["maybe_share"] > 0.35
6870
6971
def filter_data(project_data: list) -> list:
    """
    Keep only the results that fulfil the yes_maybe_condition.
    """
    # Delegate the per-row decision to the predicate function.
    return list(filter(yes_maybe_condition_true, project_data))
7580
7681
7782def check_list_sum (x , range_val ):
83+ """
84+ This checks if a give tile belongs to the defined "star"-shaped neighbourhood
85+ """
86+
7887 item_sum = abs (x [0 ]) + abs (x [1 ])
7988 if item_sum <= range_val :
8089 return True
@@ -83,6 +92,10 @@ def check_list_sum(x, range_val):
8392
8493
8594def get_neighbour_list (neighbourhood_shape : str , neighbourhood_size : int ) -> list :
95+ """
96+ Filters tiles that are neighbours.
97+ This is based on a given search radius (neighbourhood size) and search window shape (neighbourhood shape=.
98+ """
8699
87100 neighbour_list = []
88101 range_val = int (neighbourhood_size / 2 )
@@ -99,37 +112,38 @@ def get_neighbour_list(neighbourhood_shape: str, neighbourhood_size: int) -> lis
99112 return neighbour_list
100113
101114
def add_group_id_to_neighbours(task_x: int, task_y: int, task_z: int, group_id: int):
    """
    Add a group id to all other tiles that lie in the neighbourhood of
    the tile given by task_x, task_y and task_z.

    NOTE(review): relies on the module-level ``neighbour_list`` (x/y
    offset pairs) and ``yes_results_dict`` (task_id -> result dict);
    the latter is mutated in place.
    """
    # Walk every offset of the neighbourhood mask and tag matching tiles.
    for dx, dy in neighbour_list:
        neighbour_id = f"{task_z}-{int(task_x) + dx}-{int(task_y) + dy}"
        if neighbour_id in yes_results_dict:
            yes_results_dict[neighbour_id]["my_group_id"] = group_id
118130
119- def create_duplicates_dict ():
131+ def create_duplicates_dict () -> dict :
132+ """
133+ Check which tasks belong to multiple groups.
134+ This will be used as a later stage to put tasks into distinct groups.
135+ """
136+
120137 duplicated_groups = {}
121138 for task_id in yes_results_dict .keys ():
122139 my_group_id = yes_results_dict [task_id ]["my_group_id" ]
123140 # check for other results in the neighbourhood
124- task_x = yes_results_dict [task_id ]["task_x" ]
125- task_y = yes_results_dict [task_id ]["task_y" ]
126-
127141 # look for neighbours
128142 for i , j in neighbour_list :
129- new_task_x = int (task_x ) + i
130- new_task_y = int (task_y ) + j
131- new_task_id = "18-{task_x}-{task_y}" . format (
132- task_x = new_task_x , task_y = new_task_y
143+ new_task_x = int (yes_results_dict [ task_id ][ " task_x" ] ) + i
144+ new_task_y = int (yes_results_dict [ task_id ][ " task_y" ] ) + j
145+ new_task_id = (
146+ f" { yes_results_dict [ task_id ][ 'task_z' ] } - { new_task_x } - { new_task_y } "
133147 )
134148
135149 if new_task_id in yes_results_dict :
@@ -149,11 +163,16 @@ def create_duplicates_dict():
149163 return duplicated_groups
150164
151165
152- def remove_duplicates (duplicated_groups ):
166+ def remove_duplicates (duplicated_groups : dict ):
167+ """
168+ Remove groups ids for tasks which have more than one.
169+ This is to make sure that every task belongs to a single group only.
170+ This distinct group id will be the basis for further geometric processing.
171+ """
172+
153173 for duplicated_group_id in sorted (duplicated_groups .keys (), reverse = True ):
154174 logger .debug (
155- "%s: %s"
156- % (duplicated_group_id , list (duplicated_groups [duplicated_group_id ]))
175+ f"{ duplicated_group_id } : { list (duplicated_groups [duplicated_group_id ])} "
157176 )
158177 my_duplicated_group_id = duplicated_group_id
159178 for other_group_id in duplicated_groups [duplicated_group_id ]:
@@ -166,9 +185,20 @@ def remove_duplicates(duplicated_groups):
166185
167186
168187def split_groups (q ):
188+ """
189+ This function will be executed using threading.
190+ First it checks if there are still processes pending in the queue.
191+ We are using a clustering algorithm to put tasks together in groups.
192+ Since it is computationally expensive to check which tiles are neighbours,
193+ we split our results into chunks (called groups here).
194+ When we reach a group size below the defined group size we will stop.
195+ Otherwise, the group will be split into two parts and
196+ both will be added as new groups to our queue.
197+ """
198+
169199 while not q .empty ():
170200 group_id , group_data , group_size = q .get ()
171- logger .debug ("the group (%s ) has %s members" % ( group_id , len (group_data )) )
201+ logger .debug (f "the group ({ group_id } ) has { len (group_data )} members" )
172202
173203 # find min x, and min y
174204 x_list = []
@@ -244,7 +274,16 @@ def create_hot_tm_tasks(
244274 neighbourhood_shape : str = "rectangle" ,
245275 neighbourhood_size : int = 5 ,
246276) -> dict :
247- # TODO: check input dict structure
277+ """
278+ This functions creates a dictionary of tiles which will be forming a task in the HOT Tasking Manager.
279+ It will create a neighbourhood list, which will function as a mask to filter tiles that are close to each other.
280+ The functions assigns group ids to each tile.
281+ For tiles that got several group ids, this will be resolved in the next step.
282+ Once each task has a unique group id, the function checks the size (number of tiles) for each group.
283+ Groups that hold too many tiles (too big to map in the Tasking Manager) will be split into smaller groups.
284+ Finally, a dictionary is returned which holds each group as an item.
285+ Each group consists of a limited number of tiles.
286+ """
248287
249288 # final groups dict will store the groups that are exported
250289 final_groups_dict = {}
@@ -286,11 +325,13 @@ def create_hot_tm_tasks(
286325 logger .debug ("created new group id" )
287326 logger .debug ("group id: %s" % group_id )
288327
289- # check for other results in the neighbourhood
290- task_x = yes_results_dict [task_id ]["task_x" ]
291- task_y = yes_results_dict [task_id ]["task_y" ]
292-
293- check_neighbours (task_x , task_y , group_id )
328+ # check for other results in the neighbourhood and add the group id to them
329+ add_group_id_to_neighbours (
330+ yes_results_dict [task_id ]["task_x" ],
331+ yes_results_dict [task_id ]["task_y" ],
332+ yes_results_dict [task_id ]["task_z" ],
333+ group_id ,
334+ )
294335
295336 logger .info ("added group ids to yes maybe results dict" )
296337
@@ -352,6 +393,11 @@ def create_hot_tm_tasks(
352393
353394
354395def dissolve_project_data (project_data ):
396+ """
397+ This functions uses the unionCascaded function to return a dissolved MultiPolygon geometry
398+ from several Single Part Polygon geometries.
399+ """
400+
355401 multipolygon_geometry = ogr .Geometry (ogr .wkbMultiPolygon )
356402 for item in project_data :
357403 polygon = ogr .CreateGeometryFromWkt (item ["wkt" ])
@@ -362,6 +408,13 @@ def dissolve_project_data(project_data):
362408
363409
364410def generate_tasking_manager_geometries (project_id : str ):
411+ """
412+ This functions runs the workflow to create a GeoJSON file ready to be used in the HOT Tasking Manager.
413+ First, data is loaded from the aggregated results csv file.
414+ Then it filers results for which a defined threshold of yes and maybe classifications has been reached.
415+ We then derive the Tasking Manager geometries, and a dissolved geometry of all filtered results.
416+ Finally, both data sets are saved into GeoJSON files.
417+ """
365418
366419 raw_data_filename = f"{ DATA_PATH } /api-data/agg_results/agg_results_{ project_id } .csv"
367420 filtered_data_filename = (
@@ -375,18 +428,19 @@ def generate_tasking_manager_geometries(project_id: str):
375428 results = load_data (project_id , raw_data_filename )
376429
377430 # filter yes and maybe results
378- filtered_results = filter_data (project_id , results )
431+ filtered_results = filter_data (results )
379432
380- # dissolve filtered results
381- dissolved_filtered_results = dissolve_project_data (filtered_results )
433+ if len (filtered_results ) > 0 :
434+ # dissolve filtered results
435+ dissolved_filtered_results = dissolve_project_data (filtered_results )
382436
383- # create tasking manager geometries
384- tasking_manager_results = create_hot_tm_tasks (project_id , filtered_results )
437+ # create tasking manager geometries
438+ tasking_manager_results = create_hot_tm_tasks (project_id , filtered_results )
385439
386- # save data as geojson
387- geojson_functions .create_geojson_file (
388- dissolved_filtered_results , filtered_data_filename
389- )
390- geojson_functions .create_geojson_file_from_dict (
391- tasking_manager_results , tasking_manager_data_filename
392- )
440+ # save data as geojson
441+ geojson_functions .create_geojson_file (
442+ dissolved_filtered_results , filtered_data_filename
443+ )
444+ geojson_functions .create_geojson_file_from_dict (
445+ tasking_manager_results , tasking_manager_data_filename
446+ )
0 commit comments