Skip to content

Commit ce67205

Browse files
committed
use projects df to derive overall stats
1 parent f7827a1 commit ce67205

File tree

2 files changed

+25
-46
lines changed

2 files changed

+25
-46
lines changed

mapswipe_workers/mapswipe_workers/generate_stats/generate_stats.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,6 @@ def generate_stats(project_id_list: list):
2121
project_id_list: list
2222
"""
2323

24-
logger.info('will generate overall stats')
25-
overall_stats_filename = f"{DATA_PATH}/api-data/stats.csv"
26-
overall_stats.get_overall_stats(overall_stats_filename)
27-
2824
logger.info(f"will generate stats for: {project_id_list}")
2925
projects_info_filename = f"{DATA_PATH}/api-data/projects/projects_static.csv"
3026
projects_df = overall_stats.get_project_static_info(projects_info_filename)
@@ -64,10 +60,14 @@ def generate_stats(project_id_list: list):
6460

6561
# TODO: for build area projects generate tasking manager geometries
6662

67-
# merge static info and dynamic info and save
6863
if len(project_id_list) > 0:
64+
# merge static info and dynamic info and save
6965
projects_filename = f"{DATA_PATH}/api-data/projects/projects.csv"
70-
overall_stats.save_projects(projects_filename, projects_df, projects_dynamic_df)
66+
projects_df = overall_stats.save_projects(projects_filename, projects_df, projects_dynamic_df)
67+
68+
# generate overall stats for active, inactive, finished projects
69+
overall_stats_filename = f"{DATA_PATH}/api-data/stats.csv"
70+
overall_stats.get_overall_stats(projects_df, overall_stats_filename)
7171

7272
logger.info(f"finished generate stats for: {project_id_list}")
7373

mapswipe_workers/mapswipe_workers/generate_stats/overall_stats.py

Lines changed: 19 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5,54 +5,31 @@
55
from mapswipe_workers.utils import geojson_functions
66

77

8-
def get_overall_stats(filename: str) -> pd.DataFrame:
8+
def get_overall_stats(projects_df: pd.DataFrame, filename: str) -> pd.DataFrame:
99
"""
10-
The function queries the projects table and row_counts table in postgres.
11-
The query results are stored in the specified csv file.
12-
And also returned as a pandas DataFrame.
10+
The function aggregates the statistics per project using the status attribute.
11+
We derive aggregated statistics for active, inactive and finished projects.
12+
The number of users should not be summed up here, since this would generate wrong results.
13+
A single user can contribute to multiple projects, so summing per-project user counts would count that user more than once.
1314
1415
Parameters
1516
----------
17+
projects_df: pd.DataFrame
1618
filename: str
1719
"""
1820

19-
pg_db = auth.postgresDB()
20-
sql_query = """
21-
COPY (
22-
SELECT
23-
'all' as status
24-
,count(*) as count_projects
25-
,round(SUM(ST_Area(geom::geography)/(1000*1000))::numeric, 1) as area_projects_sqkm
26-
,(SELECT reltuples FROM row_counts WHERE relname = 'groups') as count_groups
27-
,(SELECT reltuples FROM row_counts WHERE relname = 'tasks') as count_tasks
28-
,(SELECT reltuples FROM row_counts WHERE relname = 'results') as count_results
29-
,(SELECT count(*) FROM users) as count_users
30-
,clock_timestamp()
31-
FROM projects
32-
UNION
33-
SELECT
34-
status
35-
,count(*) as count_projects
36-
,round(SUM(ST_Area(geom::geography)/(1000*1000))::numeric, 1) as area_sqkm
37-
,NULL
38-
,NULL
39-
,NULL
40-
,NULL
41-
,clock_timestamp()
42-
FROM projects
43-
GROUP BY
44-
status
45-
ORDER BY count_tasks
46-
) TO STDOUT WITH CSV HEADER"""
47-
48-
with open(filename, "w") as f:
49-
pg_db.copy_expert(sql_query, f)
21+
overall_stats_df = projects_df.groupby(['status']).agg(
22+
count_projects=pd.NamedAgg(column='project_id', aggfunc='count'),
23+
area_sqkm=pd.NamedAgg(column='area_sqkm', aggfunc='sum'),
24+
number_of_results=pd.NamedAgg(column='number_of_results', aggfunc='sum'),
25+
number_of_results_progress=pd.NamedAgg(column='number_of_results_progress', aggfunc='sum'),
26+
average_number_of_users_per_project=pd.NamedAgg(column='number_of_users', aggfunc='mean')
27+
)
5028

51-
del pg_db
52-
logger.info("got overall stats from postgres.")
29+
overall_stats_df.to_csv(filename, index_label="status")
30+
logger.info(f'saved overall stats to {filename}')
5331

54-
df = pd.read_csv(filename)
55-
return df
32+
return overall_stats_df
5633

5734

5835
def get_project_static_info(filename: str) -> pd.DataFrame:
@@ -121,7 +98,7 @@ def load_project_info_dynamic(filename: str) -> pd.DataFrame:
12198
return df
12299

123100

124-
def save_projects(filename: str, df: pd.DataFrame, df_dynamic: pd.DataFrame) -> None:
101+
def save_projects(filename: str, df: pd.DataFrame, df_dynamic: pd.DataFrame) -> pd.DataFrame:
125102
"""
126103
The function merges the dataframes for static and dynamic project information
127104
and then saves the result as a csv file.
@@ -143,3 +120,5 @@ def save_projects(filename: str, df: pd.DataFrame, df_dynamic: pd.DataFrame) ->
143120
logger.info(f"saved projects: {filename}")
144121
geojson_functions.csv_to_geojson(filename, "geom")
145122
geojson_functions.csv_to_geojson(filename, "centroid")
123+
124+
return projects_df

0 commit comments

Comments
 (0)