Skip to content

Commit b48f715

Browse files
committed
add option to generate stats for all projects
1 parent d5e6693 commit b48f715

File tree

3 files changed

+78
-51
lines changed

3 files changed

+78
-51
lines changed

mapswipe_workers/mapswipe_workers/generate_stats/generate_stats.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,20 @@
55
from mapswipe_workers.generate_stats import overall_stats
66

77

8-
def generate_stats(project_id_list):
9-
logger.info(f'will generate stats for: {project_id_list}')
8+
def generate_stats(project_id_list, all_projects=False):
109

1110
projects_info_filename = f'{DATA_PATH}/api-data/projects/projects_static.csv'
1211
projects_df = overall_stats.get_project_static_info(projects_info_filename)
1312

1413
projects_info_dynamic_filename = f'{DATA_PATH}/api-data/projects/projects_dynamic.csv'
1514
projects_dynamic_df = overall_stats.load_project_info_dynamic(projects_info_dynamic_filename)
1615

16+
if all_projects:
17+
project_id_list = projects_df['project_id'].to_list()
18+
logger.info(f'will generate stats for all projects.')
19+
20+
logger.info(f'will generate stats for: {project_id_list}')
21+
1722
# get per project stats and aggregate based on task_id
1823
for project_id in project_id_list:
1924

@@ -29,10 +34,11 @@ def generate_stats(project_id_list):
2934

3035
# aggregate results and get per project statistics
3136
project_stats_dict = project_stats.get_per_project_statistics(project_id)
32-
projects_dynamic_df = projects_dynamic_df.append(project_stats_dict, ignore_index=True)
33-
projects_dynamic_df.to_csv(projects_info_dynamic_filename, index_label='idx')
37+
if project_stats_dict:
38+
projects_dynamic_df = projects_dynamic_df.append(project_stats_dict, ignore_index=True)
39+
projects_dynamic_df.to_csv(projects_info_dynamic_filename, index_label='idx')
3440

35-
# TODO: for build area projects generate tasking manager geometries
41+
# TODO: for build area projects generate tasking manager geometries
3642

3743
# merge static info and dynamic info and save
3844
if len(project_id_list) > 0:

mapswipe_workers/mapswipe_workers/generate_stats/project_stats.py

Lines changed: 48 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,18 @@ def get_results_by_project_id(filename, project_id):
3838
logger.info(f'got results from postgres for {project_id}')
3939

4040
df = pd.read_csv(filename)
41-
df['group_id'] = df.apply(lambda row: id_to_string(row['group_id']), axis=1)
42-
df['group_id'] = df['group_id'].astype(str)
43-
df['timestamp'] = pd.to_datetime(df['timestamp'])
44-
df['day'] = df['timestamp'].apply(
45-
lambda df: datetime.datetime(year=df.year, month=df.month, day=df.day))
4641

47-
logger.info(f'created pandas results df for {project_id}')
48-
return df
42+
if len(df) > 0:
43+
df['group_id'] = df.apply(lambda row: id_to_string(row['group_id']), axis=1)
44+
df['group_id'] = df['group_id'].astype(str)
45+
df['timestamp'] = pd.to_datetime(df['timestamp'])
46+
df['day'] = df['timestamp'].apply(
47+
lambda df: datetime.datetime(year=df.year, month=df.month, day=df.day))
48+
logger.info(f'created pandas results df for {project_id}')
49+
return df
50+
else:
51+
logger.info(f'there are no results for this project {project_id}')
52+
return None
4953

5054

5155
def get_tasks_by_project_id(filename, project_id):
@@ -246,34 +250,40 @@ def get_per_project_statistics(project_id):
246250

247251
# load data from postgres or local storage if already downloaded
248252
results_df = get_results_by_project_id(results_filename, project_id)
249-
groups_df = get_groups_by_project_id(groups_filename, project_id)
250-
tasks_df = get_tasks_by_project_id(tasks_filename, project_id)
251-
252-
# aggregate results by task id
253-
agg_results_df = agg_results_by_task_id(results_df, tasks_df)
254-
agg_results_df.to_csv(agg_results_filename, index_label='idx')
255-
logger.info(f'saved agg results for {project_id}: {agg_results_filename}')
256-
geojson_functions.csv_to_geojson(agg_results_filename, 'geom')
257-
258-
# calculate progress by date
259-
progress_by_date_df = get_progress_by_date(results_df, groups_df)
260-
261-
# calculate contributors by date
262-
contributors_by_date_df = get_contributors_by_date(results_df)
263-
264-
# merge contributors and progress
265-
project_stats_by_date_df = progress_by_date_df.merge(contributors_by_date_df, left_on='day', right_on='day')
266-
project_stats_by_date_df['project_id'] = project_id
267-
project_stats_by_date_df.to_csv(project_stats_by_date_filename)
268-
logger.info(f'saved project stats by date for {project_id}: {project_stats_by_date_filename}')
269-
270-
project_stats_dict = {
271-
'project_id': project_id,
272-
'progress': project_stats_by_date_df['cum_progress'].iloc[-1],
273-
'number_of_users': project_stats_by_date_df['cum_number_of_users'].iloc[-1],
274-
'number_of_results': project_stats_by_date_df['cum_number_of_results'].iloc[-1],
275-
'number_of_results_progress': project_stats_by_date_df['cum_number_of_results_progress'].iloc[-1],
276-
'day': project_stats_by_date_df.index[-1]
277-
}
278-
279-
return project_stats_dict
253+
254+
if results_df is None:
255+
logger.info(f'no results: skipping per project stats for {project_id}')
256+
return None
257+
else:
258+
groups_df = get_groups_by_project_id(groups_filename, project_id)
259+
tasks_df = get_tasks_by_project_id(tasks_filename, project_id)
260+
261+
# aggregate results by task id
262+
agg_results_df = agg_results_by_task_id(results_df, tasks_df)
263+
agg_results_df.to_csv(agg_results_filename, index_label='idx')
264+
logger.info(f'saved agg results for {project_id}: {agg_results_filename}')
265+
geojson_functions.csv_to_geojson(agg_results_filename, 'geom')
266+
267+
# calculate progress by date
268+
progress_by_date_df = get_progress_by_date(results_df, groups_df)
269+
270+
# calculate contributors by date
271+
contributors_by_date_df = get_contributors_by_date(results_df)
272+
273+
# merge contributors and progress
274+
project_stats_by_date_df = progress_by_date_df.merge(contributors_by_date_df, left_on='day', right_on='day')
275+
project_stats_by_date_df['project_id'] = project_id
276+
project_stats_by_date_df.to_csv(project_stats_by_date_filename)
277+
logger.info(f'saved project stats by date for {project_id}: {project_stats_by_date_filename}')
278+
279+
project_stats_dict = {
280+
'project_id': project_id,
281+
'progress': project_stats_by_date_df['cum_progress'].iloc[-1],
282+
'number_of_users': project_stats_by_date_df['cum_number_of_users'].iloc[-1],
283+
'number_of_results': project_stats_by_date_df['cum_number_of_results'].iloc[-1],
284+
'number_of_results_progress': project_stats_by_date_df['cum_number_of_results_progress'].iloc[-1],
285+
'day': project_stats_by_date_df.index[-1]
286+
}
287+
288+
return project_stats_dict
289+

mapswipe_workers/mapswipe_workers/mapswipe_workers.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -156,37 +156,48 @@ def run_firebase_to_postgres(schedule):
156156
@click.option(
157157
'--project_id_list',
158158
cls=PythonLiteralOption,
159-
default=[],
159+
default='[]',
160160
help=(
161161
f'provide project id strings as a list '
162162
f'stats will be generated only for this'
163163
f'''use it like '["project_a", "project_b"]' '''
164164
)
165165
)
166-
def run_generate_stats(schedule, project_id_list):
166+
@click.option(
167+
'--all_projects',
168+
default=False,
169+
is_flag=True,
170+
help=(
171+
f'Generate stats for all projects in postgres'
172+
)
173+
)
174+
def run_generate_stats(schedule, project_id_list, all_projects):
167175
sentry.init_sentry()
168176
try:
169177
if schedule:
170178
if schedule == 'm':
171179
sched.every(10).minutes.do(
172180
_run_generate_stats,
173-
project_id_list=project_id_list
181+
project_id_list=project_id_list,
182+
all_projects=all_projects
174183
).run()
175184
while True:
176185
sched.run_pending()
177186
time.sleep(1)
178187
elif schedule == 'h':
179188
sched.every().hour.do(
180189
_run_generate_stats,
181-
project_id_list=project_id_list
190+
project_id_list=project_id_list,
191+
all_projects=all_projects
182192
).run()
183193
while True:
184194
sched.run_pending()
185195
time.sleep(1)
186196
elif schedule == 'd':
187197
sched.every().day.do(
188198
_run_generate_stats,
189-
project_id_list=project_id_list
199+
project_id_list=project_id_list,
200+
all_projects=all_projects
190201
).run()
191202
while True:
192203
sched.run_pending()
@@ -199,7 +210,7 @@ def run_generate_stats(schedule, project_id_list):
199210
f'h for every hour and d for every day.'
200211
)
201212
else:
202-
_run_generate_stats(project_id_list)
213+
_run_generate_stats(project_id_list, all_projects)
203214
except Exception as e:
204215
slack.send_error(e)
205216
sentry.capture_exception_sentry(e)
@@ -421,8 +432,8 @@ def _run_firebase_to_postgres():
421432
return project_id_list
422433

423434

424-
def _run_generate_stats(project_id_list):
425-
generate_stats.generate_stats(project_id_list)
435+
def _run_generate_stats(project_id_list, all_projects):
436+
generate_stats.generate_stats(project_id_list, all_projects)
426437

427438

428439
def _run_user_management(email, manager):

0 commit comments

Comments (0)