Skip to content

Commit 9040f94

Browse files
authored
Merge pull request #232 from mapswipe/feature-overall-stats
Feature overall stats
2 parents c338002 + 23223f7 commit 9040f94

File tree

2 files changed

+80
-11
lines changed

2 files changed

+80
-11
lines changed

mapswipe_workers/mapswipe_workers/generate_stats/generate_stats.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ def generate_stats(project_id_list: list):
2020
----------
2121
project_id_list: list
2222
"""
23-
logger.info(f"will generate stats for: {project_id_list}")
2423

24+
logger.info(f"will generate stats for: {project_id_list}")
2525
projects_info_filename = f"{DATA_PATH}/api-data/projects/projects_static.csv"
2626
projects_df = overall_stats.get_project_static_info(projects_info_filename)
2727
project_id_list_postgres = projects_df["project_id"].to_list()
@@ -60,10 +60,14 @@ def generate_stats(project_id_list: list):
6060

6161
# TODO: for build area projects generate tasking manager geometries
6262

63-
# merge static info and dynamic info and save
6463
if len(project_id_list) > 0:
64+
# merge static info and dynamic info and save
6565
projects_filename = f"{DATA_PATH}/api-data/projects/projects.csv"
66-
overall_stats.save_projects(projects_filename, projects_df, projects_dynamic_df)
66+
projects_df = overall_stats.save_projects(projects_filename, projects_df, projects_dynamic_df)
67+
68+
# generate overall stats for active, inactive, finished projects
69+
overall_stats_filename = f"{DATA_PATH}/api-data/stats.csv"
70+
overall_stats.get_overall_stats(projects_df, overall_stats_filename)
6771

6872
logger.info(f"finished generate stats for: {project_id_list}")
6973

mapswipe_workers/mapswipe_workers/generate_stats/overall_stats.py

Lines changed: 73 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,57 @@
55
from mapswipe_workers.utils import geojson_functions
66

77

8-
def get_project_static_info(filename):
8+
def get_overall_stats(projects_df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """
    Aggregate the per-project statistics by project status and save them as csv.

    One row is produced per distinct value of the ``status`` column
    (e.g. active, inactive and finished projects).
    The number of users is deliberately averaged and not summed up:
    a single user can contribute to multiple projects,
    so a sum over projects would count such users more than once.

    Parameters
    ----------
    projects_df: pd.DataFrame
        one row per project; the columns status, project_id, area_sqkm,
        number_of_results, number_of_results_progress and number_of_users are used
    filename: str
        path of the csv file the aggregated statistics are written to

    Returns
    -------
    pd.DataFrame
        the aggregated statistics, indexed by status
    """

    # output column -> (input column, aggregation function)
    aggregations = {
        "count_projects": ("project_id", "count"),
        "area_sqkm": ("area_sqkm", "sum"),
        "number_of_results": ("number_of_results", "sum"),
        "number_of_results_progress": ("number_of_results_progress", "sum"),
        "average_number_of_users_per_project": ("number_of_users", "mean"),
    }

    grouped_by_status = projects_df.groupby(["status"])
    stats_per_status_df = grouped_by_status.agg(**aggregations)

    stats_per_status_df.to_csv(filename, index_label="status")
    logger.info(f"saved overall stats to {filename}")

    return stats_per_status_df
33+
34+
35+
def get_project_static_info(filename: str) -> pd.DataFrame:
36+
"""
37+
The function queries the projects table.
38+
Each row represents a single project and provides the information which is static.
39+
By static we understand all attributes which are not affected by new results being contributed.
40+
The results are stored in a csv file and also returned as a pandas DataFrame.
41+
42+
Parameters
43+
----------
44+
filename: str
45+
"""
946

1047
pg_db = auth.postgresDB()
48+
49+
# make sure to replace newline characters here
1150
sql_query = """
1251
COPY (
1352
SELECT
1453
project_id
15-
,name
16-
,project_details
17-
,look_for
54+
,regexp_replace(name, E'[\\n\\r]+', ' ', 'g' ) as name
55+
,regexp_replace(project_details, E'[\\n\\r]+', ' ', 'g' ) as project_details
56+
,regexp_replace(look_for, E'[\\n\\r]+', ' ', 'g' ) as look_for
1857
,project_type
19-
,status
58+
,regexp_replace(status, E'[\\n\\r]+', ' ', 'g' ) as status
2059
,ST_Area(geom::geography)/1000000 as area_sqkm
2160
,ST_AsText(geom) as geom
2261
,ST_AsText(ST_Centroid(geom)) as centroid
@@ -30,10 +69,20 @@ def get_project_static_info(filename):
3069
logger.info("got projects from postgres.")
3170

3271
df = pd.read_csv(filename)
72+
3373
return df
3474

3575

36-
def load_project_info_dynamic(filename):
76+
def load_project_info_dynamic(filename: str) -> pd.DataFrame:
77+
"""
78+
The function loads data from a csv file into a pandas dataframe.
79+
If no file exists, it will be initialized.
80+
81+
Parameters
82+
----------
83+
filename: str
84+
"""
85+
3786
if os.path.isfile(filename):
3887
logger.info(f"file {filename} exists. Init from this file.")
3988
df = pd.read_csv(filename, index_col="idx")
@@ -52,11 +101,27 @@ def load_project_info_dynamic(filename):
52101
return df
53102

54103

55-
def save_projects(filename, df, df_dynamic):
104+
def save_projects(filename: str, df: pd.DataFrame, df_dynamic: pd.DataFrame) -> pd.DataFrame:
    """
    The function merges the dataframes for static and dynamic project information
    and then saves the result as a csv file.
    Additionally, two geojson files are generated from this csv file using
    (a) the geometry of the projects and
    (b) the centroid of the projects.

    Parameters
    ----------
    filename: str
        path of the csv file the merged project information is written to
    df: pd.DataFrame
        static project information, one row per project (left side of the merge)
    df_dynamic: pd.DataFrame
        dynamic project information, joined on the project_id column

    Returns
    -------
    pd.DataFrame
        the merged dataframe that has been written to the csv file
    """

    # left join: every project of the static table is kept,
    # even if there is no dynamic information for it
    projects_df = df.merge(df_dynamic, on="project_id", how="left")

    # Always terminate rows with "\n", independent of the platform default.
    # pandas 1.5 renamed the keyword `line_terminator` to `lineterminator`
    # and pandas 2.0 removed the old spelling, so try the new name first and
    # fall back to the old one for older pandas versions.
    try:
        projects_df.to_csv(filename, index_label="idx", lineterminator="\n")
    except TypeError:
        projects_df.to_csv(filename, index_label="idx", line_terminator="\n")
    logger.info(f"saved projects: {filename}")

    # derive the geojson files from the csv file that has just been written
    geojson_functions.csv_to_geojson(filename, "geom")
    geojson_functions.csv_to_geojson(filename, "centroid")

    return projects_df

0 commit comments

Comments
 (0)