Skip to content

Commit deadd48

Browse files
committed
add comments and doc
1 parent c6d5ebe commit deadd48

File tree

1 file changed

+46
-3
lines changed

1 file changed

+46
-3
lines changed

mapswipe_workers/mapswipe_workers/generate_stats/project_stats.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def id_to_string(x):
1919

2020
def get_results_by_project_id(filename, project_id):
2121
'''
22-
Export raw results on project_id basis.
22+
Export results for the given project id.
2323
2424
Parameters
2525
----------
@@ -54,7 +54,7 @@ def get_results_by_project_id(filename, project_id):
5454

5555
def get_tasks_by_project_id(filename, project_id):
5656
'''
57-
Export raw results on project_id basis.
57+
Export tasks for the given project id. Only export if not already downloaded before.
5858
5959
Parameters
6060
----------
@@ -87,7 +87,7 @@ def get_tasks_by_project_id(filename, project_id):
8787

8888
def get_groups_by_project_id(filename, project_id):
8989
'''
90-
Export raw results on project_id basis.
90+
Export groups for the given project id. Only export if not already downloaded before.
9191
9292
Parameters
9393
----------
@@ -120,6 +120,10 @@ def get_groups_by_project_id(filename, project_id):
120120

121121

122122
def calc_agreement(total, no, yes, maybe, bad):
123+
'''
124+
for each task the "agreement" is computed as defined by Scott's Pi to give a measure for inter-rater reliability
125+
https://en.wikipedia.org/wiki/Scott%27s_Pi
126+
'''
123127

124128
if total == 1:
125129
agreement = 1
@@ -134,18 +138,39 @@ def calc_agreement(total, no, yes, maybe, bad):
134138

135139

136140
def calc_results_progress(number_of_users, number_of_users_required, cum_number_of_users, number_of_tasks, number_of_results):
    '''
    Return how many of the given results count towards project progress.

    Results stop counting once the required number of users has been
    reached, so only part (or none) of a batch of results may contribute.

    Three cases are distinguished:
    - cum_number_of_users <= number_of_users_required:
      every result counts, number_of_results is returned unchanged
    - the count before this batch (cum_number_of_users - number_of_users)
      is still below number_of_users_required:
      only the missing users count, each for number_of_tasks results
    - otherwise:
      the requirement was already met before this batch, 0 is returned

    NOTE(review): this assumes cum_number_of_users already includes
    number_of_users - confirm against the caller.
    '''

    # requirement not exceeded yet: all results contribute to progress
    if cum_number_of_users <= number_of_users_required:
        return number_of_results

    # number of users that had contributed before the current batch
    users_before = cum_number_of_users - number_of_users

    # requirement is crossed within this batch: only the users still
    # missing contribute, one result per task each
    if users_before < number_of_users_required:
        users_still_missing = number_of_users_required - users_before
        return users_still_missing * number_of_tasks

    # requirement was already fulfilled earlier: nothing contributes
    return 0
146163

147164

148165
def agg_results_by_task_id(results_df, tasks_df):
166+
'''
167+
for each task several users contribute results
168+
this function aggregates the results by task id
169+
the following values are calculated:
170+
total_count, 0_count, 1_count, 2_count, 3_count
171+
0_share, 1_share, 2_share, 3_share, agreement
172+
'''
173+
149174

150175
results_by_task_id_df = results_df.groupby(['project_id', 'group_id', 'task_id', 'result']).size().unstack(fill_value=0)
151176

@@ -185,6 +210,11 @@ def agg_results_by_task_id(results_df, tasks_df):
185210

186211

187212
def get_progress_by_date(results_df, groups_df):
213+
'''
214+
for each project we retrospectively generate the following attributes for a given date utilizing the results:
215+
number_of_results, cum_number_of_results, progress, cum_progress
216+
'''
217+
188218

189219
groups_df['required_results'] = groups_df['number_of_tasks'] * groups_df['number_of_users_required']
190220
required_results = groups_df['required_results'].sum()
@@ -220,13 +250,21 @@ def get_progress_by_date(results_df, groups_df):
220250

221251

222252
def get_new_user(day, first_day):
    '''
    Flag whether the given day is the user's first day of contribution.

    Returns 1 if day equals first_day, otherwise 0.
    '''

    return 1 if day == first_day else 0
227261

228262

229263
def get_contributors_by_date(results_df):
264+
'''
265+
for each project we retrospectively generate the following attributes for a given date utilizing the results:
266+
number_of_users, number_of_new_users, cum_number_of_users
267+
'''
230268

231269
user_first_day_df = results_df.groupby(['user_id']).agg(
232270
first_day=pd.NamedAgg(column='day', aggfunc='min')
@@ -253,6 +291,11 @@ def get_contributors_by_date(results_df):
253291

254292

255293
def get_per_project_statistics(project_id):
294+
'''
295+
the function to calculate all project related statistics
296+
will derive:
297+
results, groups, tasks, agg_results, history
298+
'''
256299

257300
# set filenames
258301
results_filename = f'{DATA_PATH}/api-data/results/results_{project_id}.csv'

0 commit comments

Comments
 (0)