11import datetime
22import os
33from typing import List
4+ import gzip
5+ import io
46
57import pandas as pd
68from psycopg2 import sql
def write_sql_to_csv(filename: str, sql_query: sql.SQL):
    """Execute a Postgres COPY query and write its output as a gzipped csv.

    Args:
        filename: Destination path for the gzip-compressed csv file.
        sql_query: A ``COPY ... TO STDOUT`` statement run via ``copy_expert``.
    """
    pg_db = auth.postgresDB()
    # Stream the COPY output straight into the gzip writer instead of going
    # through a fixed-name "temp.csv": the temp file collided between
    # concurrent runs, was leaked if copy_expert raised, and forced a second
    # full read/write pass over the data. "wt" keeps text mode, matching the
    # previous open(..., "w") that copy_expert wrote into.
    with gzip.open(filename, "wt") as f:
        pg_db.copy_expert(sql_query, f)
    logger.info(f"wrote gzipped csv file from sql: {filename}")
3949
def load_df_from_csv(filename: str) -> pd.DataFrame:
    """Read a gzipped csv file into a pandas DataFrame.

    The id columns are forced to ``str`` so numeric-looking identifiers
    keep their exact textual form (e.g. leading zeros are preserved).
    """
    id_columns = ("project_id", "group_id", "task_id")
    df = pd.read_csv(
        filename,
        dtype={column: str for column in id_columns},
        compression="gzip",
    )
    logger.info(f"loaded pandas df from {filename}")
    return df
5064
@@ -322,11 +336,11 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
322336 """
323337
324338 # set filenames
325- results_filename = f"{ DATA_PATH } /api/results/results_{ project_id } .csv"
326- tasks_filename = f"{ DATA_PATH } /api/tasks/tasks_{ project_id } .csv"
327- groups_filename = f"{ DATA_PATH } /api/groups/groups_{ project_id } .csv"
328- agg_results_filename = f"{ DATA_PATH } /api/agg_results/agg_results_{ project_id } .csv"
329- agg_results_by_user_id_filename = f"{ DATA_PATH } /api/users/users_{ project_id } .csv"
339+ results_filename = f"{ DATA_PATH } /api/results/results_{ project_id } .csv.gz "
340+ tasks_filename = f"{ DATA_PATH } /api/tasks/tasks_{ project_id } .csv.gz "
341+ groups_filename = f"{ DATA_PATH } /api/groups/groups_{ project_id } .csv.gz "
342+ agg_results_filename = f"{ DATA_PATH } /api/agg_results/agg_results_{ project_id } .csv.gz "
343+ agg_results_by_user_id_filename = f"{ DATA_PATH } /api/users/users_{ project_id } .csv.gz "
330344 project_stats_by_date_filename = f"{ DATA_PATH } /api/history/history_{ project_id } .csv"
331345
332346 # load data from postgres or local storage if already downloaded
@@ -339,11 +353,23 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
339353 groups_df = get_groups (groups_filename , project_id )
340354 tasks_df = get_tasks (tasks_filename , project_id )
341355
356+ # always assign add_metadata so the unconditional use below cannot raise NameError
357+ add_metadata = any ("maxar" in s for s in project_info ["tile_server_names" ])
358+
342359 # aggregate results by task id
343360 agg_results_df = get_agg_results_by_task_id (results_df , tasks_df )
344- agg_results_df .to_csv (agg_results_filename , index_label = "idx" )
361+ agg_results_df .to_csv (
362+ agg_results_filename ,
363+ index_label = "idx" ,
364+ compression = "gzip"
365+ )
366+
367+ geojson_functions .gzipped_csv_to_gzipped_geojson (
368+ filename = agg_results_filename ,
369+ geometry_field = "geom" ,
370+ add_metadata = add_metadata
371+ )
345372 logger .info (f"saved agg results for { project_id } : { agg_results_filename } " )
346- geojson_functions .csv_to_geojson (agg_results_filename , "geom" )
347373
348374 # aggregate results by user id
349375 # TODO: solve memory issue for agg results by user id
@@ -361,10 +387,6 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
361387 sentry .capture_exception ()
362388 logger .info (f"failed to agg results by user id for { project_id } " )
363389
364- if any ("maxar" in s for s in project_info ["tile_server_names" ]):
365- add_metadata_to_csv (agg_results_filename )
366- geojson_functions .add_metadata_to_geojson (agg_results_filename )
367-
368390 project_stats_by_date_df = project_stats_by_date .get_project_history (
369391 results_df , groups_df
370392 )
@@ -380,7 +402,10 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
380402 # do not do this for ArbitraryGeometry / BuildingFootprint projects
381403 logger .info (f"do NOT generate tasking manager geometries for { project_id } " )
382404 else :
383- tasking_manager_geometries .generate_tasking_manager_geometries (project_id )
405+ tasking_manager_geometries .generate_tasking_manager_geometries (
406+ project_id = project_id ,
407+ agg_results_filename = agg_results_filename
408+ )
384409
385410 # prepare output of function
386411 project_stats_dict = {
0 commit comments