import ast
import datetime
import gzip
import json
from typing import Optional
@@ -238,76 +239,45 @@ def get_groups(filename: str, project_id: str) -> pd.DataFrame:
238239 return df
239240
240241
def calc_agreement(row: pd.Series) -> Optional[float]:
    """
    Compute the per-task "agreement", i.e. the extent to which raters
    agree for the i-th subject. This measure is a component of
    Fleiss' kappa: https://en.wikipedia.org/wiki/Fleiss%27_kappa

    Parameters
    ----------
    row: pd.Series
        Per-category vote counts plus a "total_count" entry holding the
        sum of all category counts.

    Returns
    -------
    Optional[float]
        The agreement value, or None when fewer than two users
        contributed (the measure is undefined for n <= 1).
    """

    n = row["total_count"]
    counts = row.drop(labels=["total_count"])

    # Agreement is undefined with fewer than two raters: the formula
    # below would divide by zero for n in {0, 1}.
    if n <= 1:
        return None

    # Extent to which raters agree for the i-th subject:
    # (sum of squared category counts - n) / (n * (n - 1))
    return (sum(c * c for c in counts) - n) / (n * (n - 1))
265261
266262
def calc_share(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate the share of each category on the total count.

    Every "*_count" column is divided row-wise by "total_count" and the
    result is joined back onto the input frame as "*_share" columns.
    """
    counts = df.filter(like="count")
    shares = counts.div(df["total_count"], axis=0)
    # total_count / total_count is always 1 — not a meaningful share.
    shares = shares.drop(columns=["total_count"])
    shares.columns = shares.columns.str.replace("_count", "_share")
    return df.join(shares)
def calc_count(df: pd.DataFrame) -> pd.Series:
    """Return the total number of votes per task.

    Computed as the row-wise sum over all columns whose name contains
    "count". Returns a Series (one value per row), not a DataFrame.
    """
    return df.filter(like="count").sum(axis=1)
308276
309- def calc_quadkey (row ):
277+ def calc_quadkey (row : pd . DataFrame ):
310278 """Calculate quadkey based on task id."""
    # TODO: This does not make sense for media type, digitalization.
    # For these project types we should move to project classes.
311281 try :
312282 tile_z , tile_x , tile_y = row ["task_id" ].split ("-" )
313283 quadkey = tile_functions .tile_coords_and_zoom_to_quadKey (
@@ -320,8 +290,26 @@ def calc_quadkey(row):
320290 return quadkey
321291
322292
def add_missing_result_columns(
    df: pd.DataFrame,
    custom_options_values: pd.Series
) -> pd.DataFrame:
    """
    Ensure the grouped results frame has one column per possible answer.

    Answer options that were never chosen get a new column filled with 0;
    columns already present keep their values.
    """
    # custom_options_values is a single-element Series holding the
    # stringified sequence of all valid answer label values.
    option_values = ast.literal_eval(custom_options_values.item())
    return df.reindex(columns=list(option_values), fill_value=0)
308+
323309def get_agg_results_by_task_id (
324- results_df : pd .DataFrame , tasks_df : pd .DataFrame
310+ results_df : pd .DataFrame ,
311+ tasks_df : pd .DataFrame ,
312+ custom_options_values : pd .Series ,
325313) -> pd .DataFrame :
326314 """
327315 For each task several users contribute results.
@@ -339,6 +327,7 @@ def get_agg_results_by_task_id(
339327 ----------
340328 results_df: pd.DataFrame
341329 tasks_df: pd.DataFrame
330+ custom_options_values: pd.Series
342331 """
343332
344333 results_by_task_id_df = (
@@ -347,23 +336,27 @@ def get_agg_results_by_task_id(
347336 .unstack (fill_value = 0 )
348337 )
349338
350- # calculate total count and check if other counts are defined
351- results_by_task_id_df [["total_count" , 0 , 1 , 2 , 3 ]] = results_by_task_id_df .apply (
352- lambda row : calc_count (row ), axis = 1 , result_type = "expand"
339+ # add columns for answer options that were not chosen for any task
340+ results_by_task_id_df = add_missing_result_columns (
341+ results_by_task_id_df ,
342+ custom_options_values ,
353343 )
354344
345+ # TODO: Add logic for parent values using sub values
346+ # [<parent_value> = <parent_value> + <child_1_value> + .. <child_N_value>]
347+
348+ # needed for ogr2ogr todo: might be legacy?
349+ results_by_task_id_df = results_by_task_id_df .add_suffix ("_count" )
350+
351+ # calculate total count of votes per task
352+ results_by_task_id_df ["total_count" ] = calc_count (results_by_task_id_df )
353+
355354 # calculate share based on counts
356- results_by_task_id_df [
357- ["0_share" , "1_share" , "2_share" , "3_share" ]
358- ] = results_by_task_id_df .apply (
359- lambda row : calc_share (row ["total_count" ], row [0 ], row [1 ], row [2 ], row [3 ]),
360- axis = 1 ,
361- result_type = "expand" ,
362- )
355+ results_by_task_id_df = calc_share (results_by_task_id_df )
363356
364357 # calculate agreement
365358 results_by_task_id_df ["agreement" ] = results_by_task_id_df .apply (
366- lambda row : calc_agreement ( row [ "total_count" ], row [ 0 ], row [ 1 ], row [ 2 ], row [ 3 ]) ,
359+ calc_agreement ,
367360 axis = 1 ,
368361 )
369362 logger .info ("calculated agreement" )
@@ -383,11 +376,6 @@ def get_agg_results_by_task_id(
383376 )
384377 logger .info ("added geometry to aggregated results" )
385378
386- # rename columns, ogr2ogr will fail otherwise
387- agg_results_df .rename (
388- columns = {0 : "0_count" , 1 : "1_count" , 2 : "2_count" , 3 : "3_count" }, inplace = True
389- )
390-
391379 return agg_results_df
392380
393381
@@ -430,7 +418,11 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
430418 add_metadata = False
431419
432420 # aggregate results by task id
433- agg_results_df = get_agg_results_by_task_id (results_df , tasks_df )
421+ agg_results_df = get_agg_results_by_task_id (
422+ results_df ,
423+ tasks_df ,
424+ project_info ["custom_options_values" ],
425+ )
434426 agg_results_df .to_csv (agg_results_filename , index_label = "idx" )
435427
436428 geojson_functions .gzipped_csv_to_gzipped_geojson (
0 commit comments