1+ import ast
12import datetime
23import gzip
34import json
45import os
56import tempfile
6- from typing import List
7+ import typing
78
89import pandas as pd
910from pandas .api .types import is_numeric_dtype
@@ -238,76 +239,57 @@ def get_groups(filename: str, project_id: str) -> pd.DataFrame:
238239 return df
239240
240241
def calc_agreement(row: pd.Series) -> typing.Optional[float]:
    """
    for each task the "agreement" is computed (i.e. the extent to which
    raters agree for the i-th subject). This measure is a component of
    Fleiss' kappa: https://en.wikipedia.org/wiki/Fleiss%27_kappa

    Parameters
    ----------
    row: pd.Series
        Per-task category counts plus a "total_count" entry holding the
        sum of all category counts.

    Returns
    -------
    float or None
        The per-subject agreement, or None when fewer than two users
        contributed (agreement is undefined in that case).
    """

    # total number of votes for this task
    n = row["total_count"]

    # keep only the per-category counts
    counts = row.drop(labels=["total_count"])

    # set agreement to None if at most one user contributed
    if n <= 1:
        return None

    # extent to which raters agree for the ith subject
    return (sum(i ** 2 for i in counts) - n) / (n * (n - 1))
265261
266262
def calc_share(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate the share of each category on the total count.

    For every ``*_count`` column a matching ``*_share`` column is appended,
    holding that count divided by ``total_count`` for the same row.
    """
    count_columns = df.filter(like="count")
    shares = count_columns.div(df.total_count, axis=0)
    # total_count / total_count is always 1 and not a useful share
    shares = shares.drop(columns=["total_count"])
    shares.columns = [name.replace("_count", "_share") for name in shares.columns]
    return df.join(shares)
def calc_parent_option_count(
    df: pd.DataFrame,
    custom_options: typing.Dict[int, typing.Set[int]],
) -> pd.DataFrame:
    """Return a copy of ``df`` in which every parent option's count column
    additionally includes the counts of all of its sub options.

    ``custom_options`` maps a parent option value to the set of its
    sub option values; columns are named ``{value}_count``.
    """
    updated = df.copy()
    for parent, children in custom_options.items():
        parent_column = f"{parent}_count"
        for child in children:
            # fold each sub option's votes into the parent option
            updated[parent_column] = updated[parent_column] + updated[f"{child}_count"]
    return updated
def calc_count(df: pd.DataFrame) -> pd.Series:
    """Calculate the total number of votes per task.

    Sums all columns whose name contains "count" row-wise.

    Parameters
    ----------
    df: pd.DataFrame

    Returns
    -------
    pd.Series
        One total per row of ``df``.
    """
    count_columns = df.filter(like="count")
    return count_columns.sum(axis=1)
307287
308288
309- def calc_quadkey (row ):
289+ def calc_quadkey (row : pd . DataFrame ):
310290 """Calculate quadkey based on task id."""
291+ # TODO: This does not make sense for media type, digtitalization.
292+ # For these projects types we should move to project classes.
311293 try :
312294 tile_z , tile_x , tile_y = row ["task_id" ].split ("-" )
313295 quadkey = tile_functions .tile_coords_and_zoom_to_quadKey (
@@ -320,8 +302,42 @@ def calc_quadkey(row):
320302 return quadkey
321303
322304
def get_custom_options(custom_options: pd.Series) -> typing.Dict[int, typing.Set[int]]:
    """Parse the raw custom options into a mapping from each option value
    to the set of its sub option values.

    The series is expected to hold a single item: a Python-literal string
    of a list of option dicts with a "value" key and an optional
    "subOptions" list.
    """
    parsed_options = ast.literal_eval(custom_options.item())
    mapping: typing.Dict[int, typing.Set[int]] = {}
    for option in parsed_options:
        sub_values = {sub_option["value"] for sub_option in option.get("subOptions", [])}
        mapping[option["value"]] = sub_values
    return mapping
313+
314+
def add_missing_result_columns(
    df: typing.Union[pd.DataFrame, pd.Series],
    custom_options: typing.Dict[int, typing.Set[int]],
) -> pd.DataFrame:
    """
    Check if all possible answers columns are included in the grouped results
    data frame and add columns if missing.

    Columns are reindexed to the sorted union of all option and sub option
    values; newly added columns are filled with 0.
    """

    # union of every parent option value and every sub option value
    all_answer_label_values = {
        value
        for option, sub_options in custom_options.items()
        for value in (option, *sub_options)
    }
    return df.reindex(
        columns=sorted(all_answer_label_values),
        fill_value=0,
    )
335+
336+
323337def get_agg_results_by_task_id (
324- results_df : pd .DataFrame , tasks_df : pd .DataFrame
338+ results_df : pd .DataFrame ,
339+ tasks_df : pd .DataFrame ,
340+ custom_options_raw : pd .Series ,
325341) -> pd .DataFrame :
326342 """
327343 For each task several users contribute results.
@@ -339,6 +355,7 @@ def get_agg_results_by_task_id(
339355 ----------
340356 results_df: pd.DataFrame
341357 tasks_df: pd.DataFrame
358+ custom_options_raw: pd.Series
342359 """
343360
344361 results_by_task_id_df = (
@@ -347,23 +364,31 @@ def get_agg_results_by_task_id(
347364 .unstack (fill_value = 0 )
348365 )
349366
350- # calculate total count and check if other counts are defined
351- results_by_task_id_df [["total_count" , 0 , 1 , 2 , 3 ]] = results_by_task_id_df .apply (
352- lambda row : calc_count (row ), axis = 1 , result_type = "expand"
367+ custom_options = get_custom_options (custom_options_raw )
368+
369+ # add columns for answer options that were not chosen for any task
370+ results_by_task_id_df = add_missing_result_columns (
371+ results_by_task_id_df ,
372+ custom_options ,
353373 )
354374
355- # calculate share based on counts
356- results_by_task_id_df [
357- ["0_share" , "1_share" , "2_share" , "3_share" ]
358- ] = results_by_task_id_df .apply (
359- lambda row : calc_share (row ["total_count" ], row [0 ], row [1 ], row [2 ], row [3 ]),
360- axis = 1 ,
361- result_type = "expand" ,
375+ # needed for ogr2ogr todo: might be legacy?
376+ results_by_task_id_df = results_by_task_id_df .add_suffix ("_count" )
377+
378+ # calculate total count of votes per task
379+ results_by_task_id_df ["total_count" ] = calc_count (results_by_task_id_df )
380+
381+ results_by_task_id_df = calc_parent_option_count (
382+ results_by_task_id_df ,
383+ custom_options ,
362384 )
363385
386+ # calculate share based on counts
387+ results_by_task_id_df = calc_share (results_by_task_id_df )
388+
364389 # calculate agreement
365390 results_by_task_id_df ["agreement" ] = results_by_task_id_df .apply (
366- lambda row : calc_agreement ( row [ "total_count" ], row [ 0 ], row [ 1 ], row [ 2 ], row [ 3 ]) ,
391+ calc_agreement ,
367392 axis = 1 ,
368393 )
369394 logger .info ("calculated agreement" )
@@ -383,11 +408,6 @@ def get_agg_results_by_task_id(
383408 )
384409 logger .info ("added geometry to aggregated results" )
385410
386- # rename columns, ogr2ogr will fail otherwise
387- agg_results_df .rename (
388- columns = {0 : "0_count" , 1 : "1_count" , 2 : "2_count" , 3 : "3_count" }, inplace = True
389- )
390-
391411 return agg_results_df
392412
393413
@@ -430,7 +450,11 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
430450 add_metadata = False
431451
432452 # aggregate results by task id
433- agg_results_df = get_agg_results_by_task_id (results_df , tasks_df )
453+ agg_results_df = get_agg_results_by_task_id (
454+ results_df ,
455+ tasks_df ,
456+ project_info ["custom_options" ],
457+ )
434458 agg_results_df .to_csv (agg_results_filename , index_label = "idx" )
435459
436460 geojson_functions .gzipped_csv_to_gzipped_geojson (
0 commit comments