
Commit 009903e

work on PR comments #218
1 parent deadd48 commit 009903e

File tree

8 files changed, +811 -584 lines changed

mapswipe_workers/mapswipe_workers/firebase_to_postgres/update_data.py

Lines changed: 39 additions & 34 deletions
@@ -5,27 +5,27 @@
 
 
 def update_user_data(user_ids=None):
-    '''
+    """
     Copies new users from Firebase to Postgres
-    '''
+    """
     # TODO: On Conflict
     fb_db = auth.firebaseDB()
     pg_db = auth.postgresDB()
 
-    fb_ref = fb_db.reference('v2/users')
+    fb_ref = fb_db.reference("v2/users")
 
-    pg_query = '''
+    pg_query = """
         SELECT created
         FROM users
         ORDER BY created DESC
         LIMIT 1
-    '''
+    """
     last_updated = pg_db.retr_query(pg_query)
     try:
         last_updated = last_updated[0][0]
-        logger.info(f'got last updated timestamp: {last_updated}')
+        logger.info(f"got last updated timestamp: {last_updated}")
     except:
-        logger.info('could not get last timestamp')
+        logger.info("could not get last timestamp")
         last_updated = None
 
     if last_updated is None:
@@ -34,13 +34,15 @@ def update_user_data(user_ids=None):
         users = fb_ref.get()
     else:
         # Get only new users from Firebase.
-        last_updated = last_updated.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
-        fb_query = fb_ref.order_by_child('created').start_at(last_updated)
+        last_updated = last_updated.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+        fb_query = fb_ref.order_by_child("created").start_at(last_updated)
         users = fb_query.get()
         # Delete first user in ordered dict.
         # This user is already in the database (user.created = last_updated).
         if len(users) == 0:
-            logger.info(f"there are no new users in firebase based on created timestamp")
+            logger.info(
+                f"there are no new users in firebase based on created timestamp"
+            )
         else:
             users.popitem(last=False)
 
@@ -57,40 +59,43 @@ def update_user_data(user_ids=None):
         # Convert timestamp (ISO 8601) from string to a datetime object.
         try:
             created = dt.datetime.strptime(
-                user['created'].replace('Z', ''),
-                '%Y-%m-%dT%H:%M:%S.%f'
-            )
+                user["created"].replace("Z", ""), "%Y-%m-%dT%H:%M:%S.%f"
+            )
         except KeyError:
             # if user has no "created" attribute, we set it to current time
-            created = dt.datetime.utcnow().isoformat()[0:-3]+'Z'
-            logger.info(f"user {user_id} didn't have a created attribute set it to {created}")
+            created = dt.datetime.utcnow().isoformat()[0:-3] + "Z"
+            logger.info(
+                f"user {user_id} didn't have a 'created' attribute. Set it to '{created}' now."
+            )
 
         try:
-            username = user['username']
+            username = user["username"]
         except KeyError:
             # if user has no "username" attribute, we set it to None
            username = None
-            logger.info(f"user {user_id} didn't have a username attribute set it to {username}")
+            logger.info(
+                f"user {user_id} didn't have a 'username' attribute. Set it to '{username}' now."
+            )
 
-        query_update_user = '''
+        query_update_user = """
            INSERT INTO users (user_id, username, created)
            VALUES(%s, %s, %s)
            ON CONFLICT (user_id) DO UPDATE
            SET username=%s,
            created=%s;
-        '''
+        """
         data_update_user = [
-                user_id,
-                username,
-                created,
-                username,
-                created,
-                ]
+            user_id,
+            username,
+            created,
+            username,
+            created,
+        ]
         pg_db.query(query_update_user, data_update_user)
 
-    del(pg_db)
+    del pg_db
 
-    logger.info('Updated user data in Potgres')
+    logger.info("Updated user data in Potgres")
 
 
 def update_project_data(project_ids=None):
@@ -113,26 +118,26 @@ def update_project_data(project_ids=None):
        logger.info(f"update project data in postgres for selected projects")
        projects = dict()
        for project_id in project_ids:
-            project_ref = fb_db.reference(f'v2/projects/{project_id}')
+            project_ref = fb_db.reference(f"v2/projects/{project_id}")
            projects[project_id] = project_ref.get()
    else:
        logger.info(f"update project data in postgres for all firebase projects")
-        projects_ref = fb_db.reference('v2/projects/')
+        projects_ref = fb_db.reference("v2/projects/")
        projects = projects_ref.get()
 
    if projects:
        for project_id, project in projects.items():
-            query_update_project = '''
+            query_update_project = """
                UPDATE projects
                SET status=%s
                WHERE project_id=%s;
-            '''
+            """
            # TODO: Is there need for fallback to ''
            # if project.status is not existent
-            data_update_project = [project.get('status', ''), project_id]
+            data_update_project = [project.get("status", ""), project_id]
            pg_db.query(query_update_project, data_update_project)
            logger.info(f"updated status for project {project_id} in postgres")
 
-    del(pg_db)
+    del pg_db
 
-    logger.info('Updated project data in Postgres')
+    logger.info("Updated project data in Postgres")
Lines changed: 59 additions & 21 deletions
@@ -1,46 +1,84 @@
 from mapswipe_workers.definitions import logger
 from mapswipe_workers.definitions import DATA_PATH
+from mapswipe_workers.generate_stats import project_stats, overall_stats
 
-from mapswipe_workers.generate_stats import project_stats
-from mapswipe_workers.generate_stats import overall_stats
 
+def generate_stats(project_id_list: list):
+    """
+    Query attributes for all projects from postgres projects table
+    Write information on status (e.g. active, inactive, finished) and further attributes
+    for all projects to projects_static.csv.
+    Computationally more expensive tasks are only performed for projects specified in project_id_list.
+    Write information on progress and contributors and further attributes
+    only for projects specified in project_id_list to projects_dynamic.csv.
+    Write information on project progress history and aggregated results
+    only for projects specified in project_id_list to csv and geojson files.
+    Merge projects_static.csv and projects_dynamic.csv into projects.csv.
+    Convert projects.csv file into GeoJSON format using project geometry and project centroid.
 
-def generate_stats(project_id_list, all_projects=False):
+    Parameters
+    ----------
+    project_id_list: list
+    """
+    logger.info(f"will generate stats for: {project_id_list}")
 
-    projects_info_filename = f'{DATA_PATH}/api-data/projects/projects_static.csv'
+    projects_info_filename = f"{DATA_PATH}/api-data/projects/projects_static.csv"
     projects_df = overall_stats.get_project_static_info(projects_info_filename)
+    project_id_list_postgres = projects_df["project_id"].to_list()
 
-    projects_info_dynamic_filename = f'{DATA_PATH}/api-data/projects/projects_dynamic.csv'
-    projects_dynamic_df = overall_stats.load_project_info_dynamic(projects_info_dynamic_filename)
-
-    if all_projects:
-        project_id_list = projects_df['project_id'].to_list()
-        logger.info(f'will generate stats for all projects.')
-
-    logger.info(f'will generate stats for: {project_id_list}')
+    projects_info_dynamic_filename = (
+        f"{DATA_PATH}/api-data/projects/projects_dynamic.csv"
+    )
+    projects_dynamic_df = overall_stats.load_project_info_dynamic(
+        projects_info_dynamic_filename
+    )
 
     # get per project stats and aggregate based on task_id
     for project_id in project_id_list:
 
-        # check if project id is correct
-        if not project_id in projects_df['project_id'].to_list():
-            logger.info(f'project {project_id} does not exist. skip this one.')
+        # check if project id is existing
+        if project_id not in project_id_list_postgres:
+            logger.info(f"project {project_id} does not exist. skip this one.")
             continue
 
-        logger.info(f'start generate stats for project: {project_id}')
-        idx = projects_dynamic_df.index[projects_dynamic_df['project_id'] == project_id].tolist()
+        logger.info(f"start generate stats for project: {project_id}")
+        idx = projects_dynamic_df.index[
+            projects_dynamic_df["project_id"] == project_id
+        ].tolist()
         if len(idx) > 0:
             projects_dynamic_df.drop([idx[0]], inplace=True)
 
         # aggregate results and get per project statistics
         project_stats_dict = project_stats.get_per_project_statistics(project_id)
         if project_stats_dict:
-            projects_dynamic_df = projects_dynamic_df.append(project_stats_dict, ignore_index=True)
-            projects_dynamic_df.to_csv(projects_info_dynamic_filename, index_label='idx')
+            projects_dynamic_df = projects_dynamic_df.append(
+                project_stats_dict, ignore_index=True
+            )
+            projects_dynamic_df.to_csv(
+                projects_info_dynamic_filename, index_label="idx"
+            )
 
     # TODO: for build area projects generate tasking manager geometries
 
     # merge static info and dynamic info and save
     if len(project_id_list) > 0:
-        projects_filename = f'{DATA_PATH}/api-data/projects/projects.csv'
-        overall_stats.save_projects(projects_filename, projects_df, projects_dynamic_df)
+        projects_filename = f"{DATA_PATH}/api-data/projects/projects.csv"
+        overall_stats.save_projects(projects_filename, projects_df, projects_dynamic_df)
+
+
+def generate_stats_all_projects():
+    """
+    queries all existing project ids from postgres projects table
+    saves them into a csv file and returns a list of all project ids
+    then generates project statistics using the derived list of project ids
+    """
+
+    logger.info(f"will generate stats for all projects.")
+
+    # get all project ids from postgres database
+    projects_info_filename = f"{DATA_PATH}/api-data/projects/projects_static.csv"
+    projects_df = overall_stats.get_project_static_info(projects_info_filename)
+    project_id_list = projects_df["project_id"].to_list()
+
+    # generate stats for the derived project ids
+    generate_stats(project_id_list)
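
As a usage sketch for the new split: callers that previously passed all_projects=True now call generate_stats_all_projects(), which derives the id list from the Postgres projects table and delegates, while per-project runs keep calling generate_stats() with an explicit list. The module path below is an assumption (the file name is not shown in this excerpt), and the project ids are placeholders.

# Hypothetical usage sketch; the module path and the project ids are assumptions.
from mapswipe_workers.generate_stats import generate_stats

# Per-project statistics for an explicit list of project ids.
generate_stats.generate_stats(["example_project_a", "example_project_b"])

# Replacement for the removed all_projects=True flag: derive all project ids
# from the Postgres projects table, then delegate to generate_stats().
generate_stats.generate_stats_all_projects()
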

mapswipe_workers/mapswipe_workers/generate_stats/overall_stats.py

Lines changed: 20 additions & 11 deletions
@@ -23,31 +23,40 @@ def get_project_static_info(filename):
         FROM projects
     ) TO STDOUT WITH CSV HEADER"""
 
-    with open(filename, 'w') as f:
+    with open(filename, "w") as f:
         pg_db.copy_expert(sql_query, f)
 
     del pg_db
-    logger.info('got projects from postgres.')
+    logger.info("got projects from postgres.")
 
     df = pd.read_csv(filename)
     return df
 
 
 def load_project_info_dynamic(filename):
     if os.path.isfile(filename):
-        logger.info(f'file {filename} exists. Init from this file.')
-        df = pd.read_csv(filename, index_col='idx')
+        logger.info(f"file {filename} exists. Init from this file.")
+        df = pd.read_csv(filename, index_col="idx")
     else:
-        columns = ['project_id', 'progress', 'number_of_users', 'number_of_results', 'number_of_results_progress', 'day']
+        columns = [
+            "project_id",
+            "progress",
+            "number_of_users",
+            "number_of_results",
+            "number_of_results_progress",
+            "day",
+        ]
         df = pd.DataFrame(index=[], columns=columns)
-        df['project_id'].astype('str')
+        df["project_id"].astype("str")
 
     return df
 
 
 def save_projects(filename, df, df_dynamic):
-    projects_df = df.merge(df_dynamic, left_on='project_id', right_on='project_id', how='left')
-    projects_df.to_csv(filename, index_label='idx')
-    logger.info(f'saved projects: {filename}')
-    geojson_functions.csv_to_geojson(filename, 'geom')
-    geojson_functions.csv_to_geojson(filename, 'centroid')
+    projects_df = df.merge(
+        df_dynamic, left_on="project_id", right_on="project_id", how="left"
+    )
+    projects_df.to_csv(filename, index_label="idx")
+    logger.info(f"saved projects: {filename}")
+    geojson_functions.csv_to_geojson(filename, "geom")
+    geojson_functions.csv_to_geojson(filename, "centroid")
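
To illustrate the left merge that save_projects() now spells out across several lines, here is a small self-contained pandas sketch with made-up rows: every project from the static table is kept, and projects without dynamic stats get NaN in the dynamic columns.

# Minimal pandas sketch of the save_projects() merge; the rows are made up.
import pandas as pd

df_static = pd.DataFrame(
    {"project_id": ["a", "b"], "status": ["active", "finished"]}
)
df_dynamic = pd.DataFrame({"project_id": ["a"], "progress": [0.42]})

# Left merge keeps every project from the static table; "b" gets NaN progress.
projects_df = df_static.merge(
    df_dynamic, left_on="project_id", right_on="project_id", how="left"
)
print(projects_df)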
