@@ -19,7 +19,7 @@ def id_to_string(x):
1919
2020def get_results_by_project_id (filename , project_id ):
2121 '''
22- Export raw results on project_id basis .
22+ Export results for the given project id .
2323
2424 Parameters
2525 ----------
@@ -54,7 +54,7 @@ def get_results_by_project_id(filename, project_id):
5454
5555def get_tasks_by_project_id (filename , project_id ):
5656 '''
57- Export raw results on project_id basis .
57+ Export tasks for the given project id. Only export if not already downloaded before .
5858
5959 Parameters
6060 ----------
@@ -87,7 +87,7 @@ def get_tasks_by_project_id(filename, project_id):
8787
8888def get_groups_by_project_id (filename , project_id ):
8989 '''
90- Export raw results on project_id basis .
90+ Export groups based on given project id. Only export if not already downloaded before .
9191
9292 Parameters
9393 ----------
@@ -120,6 +120,10 @@ def get_groups_by_project_id(filename, project_id):
120120
121121
122122def calc_agreement (total , no , yes , maybe , bad ):
123+ '''
124+ for each task the "agreement" is computed as defined by Scott's Pi to give a measure for inter-rater reliability
125+ https://en.wikipedia.org/wiki/Scott%27s_Pi
126+ '''
123127
124128 if total == 1 :
125129 agreement = 1
@@ -134,18 +138,39 @@ def calc_agreement(total, no, yes, maybe, bad):
134138
135139
def calc_results_progress(number_of_users, number_of_users_required, cum_number_of_users, number_of_tasks, number_of_results):
    '''
    Compute how many results of the current step count towards progress.

    Results only count until the required number of users has been reached;
    results beyond that threshold do not increase the progress.

    Parameters
    ----------
    number_of_users
        number of users of the current step; it is subtracted from
        cum_number_of_users to obtain the cumulative count before this step
    number_of_users_required
        number of users after which further results no longer count
    cum_number_of_users
        cumulative number of users including the current step
    number_of_tasks
        factor applied to every user that still counts when the threshold is
        crossed (presumably the tasks per group -- confirm against the caller)
    number_of_results
        number of results of the current step

    Returns
    -------
    number_of_results_progress
        number of results that contribute to progress
    '''

    if cum_number_of_users <= number_of_users_required:
        # the required number of users is not exceeded (equal still counts):
        # all results contribute to progress
        return number_of_results

    previous_cum_number_of_users = cum_number_of_users - number_of_users
    if previous_cum_number_of_users < number_of_users_required:
        # the threshold is crossed within this step: only the users that
        # fill the remaining slots contribute, each with number_of_tasks
        remaining_users = number_of_users_required - previous_cum_number_of_users
        return remaining_users * number_of_tasks

    # the threshold had already been reached before this step:
    # no result contributes to progress
    return 0
146163
147164
148165def agg_results_by_task_id (results_df , tasks_df ):
166+ '''
167+ for each task several users contribute results
168+ this function aggregates the results by task id
169+ the following values are calculated:
170+ total_count, 0_count, 1_count, 2_count, 3_count
171+ 0_share, 1_share, 2_share, 3_share, agreement
172+ '''
173+
149174
150175 results_by_task_id_df = results_df .groupby (['project_id' , 'group_id' , 'task_id' , 'result' ]).size ().unstack (fill_value = 0 )
151176
@@ -185,6 +210,11 @@ def agg_results_by_task_id(results_df, tasks_df):
185210
186211
187212def get_progress_by_date (results_df , groups_df ):
213+ '''
214+ for each project we retrospectively generate the following attributes for a given date utilizing the results:
215+ number_of_results, cum_number_of_results, progress, cum_progress
216+ '''
217+
188218
189219 groups_df ['required_results' ] = groups_df ['number_of_tasks' ] * groups_df ['number_of_users_required' ]
190220 required_results = groups_df ['required_results' ].sum ()
@@ -220,13 +250,21 @@ def get_progress_by_date(results_df, groups_df):
220250
221251
def get_new_user(day, first_day):
    '''
    Flag whether the given day is the first day.

    Parameters
    ----------
    day
        the day to check
    first_day
        the day to compare against (presumably the first day a user
        contributed results to the project -- confirm against the caller)

    Returns
    -------
    int
        1 if day equals first_day, otherwise 0
    '''

    return 1 if day == first_day else 0
227261
228262
229263def get_contributors_by_date (results_df ):
264+ '''
265+ for each project we retrospectively generate the following attributes for a given date utilizing the results:
266+ number_of_users, number_of_new_users, cum_number_of_users
267+ '''
230268
231269 user_first_day_df = results_df .groupby (['user_id' ]).agg (
232270 first_day = pd .NamedAgg (column = 'day' , aggfunc = 'min' )
@@ -253,6 +291,11 @@ def get_contributors_by_date(results_df):
253291
254292
255293def get_per_project_statistics (project_id ):
294+ '''
295+ the function to calculate all project related statistics
296+ will derive:
297+ results, groups, tasks, agg_results, history
298+ '''
256299
257300 # set filenames
258301 results_filename = f'{ DATA_PATH } /api-data/results/results_{ project_id } .csv'
0 commit comments