1313from operators import get_datetime_from_filename
1414from arcgis .features import FeatureLayerCollection , FeatureSet , Table , Feature
1515from validator import ValidationError
16+ from agol_connection import AGOLConnection
1617
1718
1819def load_csv_to_df (csv_file_path ):
@@ -62,16 +63,18 @@ def create_summary_table_row(df, source_data_timestamp, source_filename):
6263
6364class Ingester (object ):
6465
65- def __init__ (self , dry_run = False ):
66+ def __init__ (self , dry_run = False , verbose = False ):
6667
6768 creds = self ._load_credentials ()
6869 if creds is None :
6970 raise Exception ("no sftp credentials supplied" )
7071
7172 self .creds = creds
7273 self .dry_run = dry_run
73- self .gis = None
74+ agol_connection = AGOLConnection ()
75+ self .agol = agol_connection
7476 self .available_files = []
77+ self .verbose = verbose
7578
7679 def _load_credentials (self ):
7780
@@ -89,12 +92,8 @@ def _load_credentials(self):
8992 creds ['host' ] = row ['host' ]
9093 return creds
9194
92- def set_gis (self , gis_object ):
93-
94- self .gis = gis_object
95-
9695 def get_files_from_sftp (self , prefix = "HOS_ResourceCapacity_" , target_dir = "/tmp" ,
97- only_latest = True , filenames_to_ignore = []):
96+ only_latest = True , filenames_to_ignore = [], verbose = False ):
9897
9998 cnopts = pysftp .CnOpts ()
10099 cnopts .hostkeys .load ('copaftp.pub' )
@@ -121,19 +120,27 @@ def get_files_from_sftp(self, prefix="HOS_ResourceCapacity_", target_dir="/tmp",
121120 files_to_get = files
122121 for f in files_to_get :
123122 if f in filenames_to_ignore :
124- print (f"Ignoring { f } " )
123+ if self .verbose :
124+ print (f"Ignoring { f } " )
125125 continue
126- print (f"Getting: { f } " )
126+ if self .verbose :
127+ print (f"Getting: { f } " )
127128 if os .path .join (target_dir , f ) not in existing_files :
128129 sftp .get (f , f'{ target_dir } /{ f } ' )
129- print (f"Finished downloading { target_dir } /{ f } " )
130+ if self .verbose :
131+ print (f"Finished downloading { target_dir } /{ f } " )
130132 else :
131- print (f"Didn't have to download { target_dir } /{ f } ; it already exists" )
133+ if self .verbose :
134+ print (f"Didn't have to download { target_dir } /{ f } ; it already exists" )
132135
133136 source_date = get_datetime_from_filename (f , prefix = prefix )
134137 file_details .append ({"dir" : target_dir , "filename" : f , "source_datetime" : source_date })
135138 return (file_details , files )
136139
    def get_already_processed_files(self, dataset_name):
        """Return the source filenames already ingested for *dataset_name*.

        Thin delegation to the AGOL connection, which performs the actual
        lookup of processed files for the named dataset.
        """
        return self.agol.get_already_processed_files(dataset_name)
137144 def process_hospital (self , processed_dir , processed_filename , public = True ):
138145
139146 # public vs. non-public means different ArcGIS online items
@@ -142,32 +149,38 @@ def process_hospital(self, processed_dir, processed_filename, public=True):
142149 else :
143150 dataset_name = "hospital_layer"
144151
145- print (f"Starting load of hospital data: { dataset_name } " )
152+ if self .verbose :
153+ print (f"Starting load of hospital data: { dataset_name } " )
146154
147- status = self .gis .overwrite_arcgis_layer (dataset_name , processed_dir , processed_filename , dry_run = self .dry_run )
155+ status = self .agol .overwrite_arcgis_layer (dataset_name , processed_dir , processed_filename , dry_run = self .dry_run )
148156
149- print (status )
150- print (f"Finished load of hospital data: { dataset_name } " )
157+ if self .verbose :
158+ print (status )
159+ print (f"Finished load of hospital data: { dataset_name } " )
151160 return processed_dir , processed_filename
152161
153162 def process_supplies (self , processed_dir , processed_filename ):
154- print ("Starting load of supplies data" )
163+ if self .verbose :
164+ print ("Starting load of supplies data" )
155165
156166 # set the new file name using the original file name in the layers conf
157- supplies_filename = self .gis .layers ['supplies' ]['original_file_name' ]
167+ supplies_filename = self .agol .layers ['supplies' ]['original_file_name' ]
158168
159169 df = load_csv_to_df (os .path .join (processed_dir , processed_filename ))
160170 supplies = create_supplies_table (df )
161171
162172 supplies .to_csv (os .path .join (processed_dir , supplies_filename ), index = False )
163173
164- status = self .gis .overwrite_arcgis_layer ("supplies" , processed_dir , supplies_filename , dry_run = self .dry_run )
165- print (status )
166- print ("Finished load of supplies data" )
174+ status = self .agol .overwrite_arcgis_layer ("supplies" , processed_dir , supplies_filename , dry_run = self .dry_run )
175+
176+ if self .verbose :
177+ print (status )
178+ print ("Finished load of supplies data" )
167179
168180 def process_county_summaries (self , processed_dir , processed_filename ):
169181
170- print ("Starting load of county summary table..." )
182+ if self .verbose :
183+ print ("Starting load of county summary table..." )
171184
172185 new_data_filename = "new_county_summary_table.csv"
173186
@@ -190,14 +203,18 @@ def process_county_summaries(self, processed_dir, processed_filename):
190203
191204 d2 .to_csv (os .path .join (processed_dir , new_data_filename ), header = True , index = False )
192205
193- status = self .gis .overwrite_arcgis_layer ("county_summaries" , processed_dir , new_data_filename , dry_run = self .dry_run )
194- print (status )
195- print ("Finished load of county summary data" )
206+ status = self .agol .overwrite_arcgis_layer ("county_summaries" , processed_dir , new_data_filename , dry_run = self .dry_run )
207+
208+ if self .verbose :
209+ print (status )
210+ print ("Finished load of county summary data" )
196211
197212 def process_summaries (self , processed_dir , processed_file_details , make_historical_csv = False ):
198- print ("Starting load of summary table..." )
199213
200- summary_filename = self .gis .layers ['summary_table' ]['original_file_name' ]
214+ if self .verbose :
215+ print ("Starting load of summary table..." )
216+
217+ summary_filename = self .agol .layers ['summary_table' ]['original_file_name' ]
201218
202219 summary_df = pd .DataFrame ()
203220 for f in processed_file_details :
@@ -213,14 +230,15 @@ def process_summaries(self, processed_dir, processed_file_details, make_historic
213230 if make_historical_csv :
214231 out_csv_file = os .path .join (processed_dir , summary_filename )
215232 summary_df .to_csv (out_csv_file , index = False , header = True )
216- print ("Finished creation of historical summary table CSV, returning." )
233+ if self .verbose :
234+ print ("Finished creation of historical summary table CSV, returning." )
217235 return
218236
219- layer_conf = self .gis .layers ['summary_table' ]
237+ layer_conf = self .agol .layers ['summary_table' ]
220238
221239 # this self.gis.gis.content pattern is evidence that the first pass at
222240 # a refactored structure should not be the last...
223- table = self .gis .gis .content .get (layer_conf ['id' ])
241+ table = self .agol .gis .content .get (layer_conf ['id' ])
224242 t = table .tables [0 ]
225243
226244 new_col_names = {}
@@ -237,64 +255,47 @@ def process_summaries(self, processed_dir, processed_file_details, make_historic
237255 # but it won't stop the processing.
238256 fs = FeatureSet (features )
239257 if self .dry_run :
240- print ( "Dry run set, not editing features." )
241- status = "Dry run"
258+ if self . verbose :
259+ print ( "Dry run set, not editing features." )
242260 else :
243261 status = t .edit_features (adds = features )
244- print (status )
245- print ("Finished load of summary table" )
262+ if self .verbose :
263+ print (status )
264+ if self .verbose :
265+ print ("Finished load of summary table" )
246266
247267
248268 def process_historical_hos (self , processed_dir , processed_file_details , make_historical_csv = False ):
249269
250- print ("Starting load of historical HOS table..." )
270+ if self .verbose :
271+ print ("Starting load of historical HOS table..." )
251272
252- layer_conf = self .gis .layers ['full_historical_table' ]
273+ layer_conf = self .agol .layers ['full_historical_table' ]
253274 original_data_file_name = layer_conf ['original_file_name' ]
254275
255- table = self .gis .gis .content .get (layer_conf ['id' ])
276+ table = self .agol .gis .content .get (layer_conf ['id' ])
256277 t = table .layers [0 ]
257278
258279 # get short field names that are in use online to test the input csv headers
259- agol_fields = {n ["alias" ]: n ["name" ] for n in t .properties .fields }
280+ # not used now but retained in case of future needs
281+ # agol_fields = {n["alias"]: n["name"] for n in t.properties.fields}
260282
261283 # iterate all csvs and collect the information from each one.
262284 # normalize header names at the same time
263285 hist_csv_rows = []
264- mapping = hm .HeaderMapping ("HOS" )
265- alias_lookup = mapping .get_alias_lookup () # used to convert historical names (pre-5/12)
266- valid_fieldnames = mapping .get_fieldnames () # valid short names (post-5/12)
267286 for f in processed_file_details :
268287 fname = f ["processed_filename" ]
269288 size = os .path .getsize (os .path .join (processed_dir , fname ))
270289 if size > 0 :
271- processed_time = datetime .utcnow ().isoformat ()
290+ processed_time = datetime .utcnow ().isoformat ()
272291 with open (os .path .join (processed_dir , fname ), newline = '' ) as csvfile :
273292 reader = csv .DictReader (csvfile )
274293 for row in reader :
275294
276- out_row = {}
277-
278- for col_name , value in row .items ():
279- # first test if col_name in new valid headers
280- # if so, use the col_name and value directly
281- if col_name in valid_fieldnames :
282- out_row [col_name ] = value
283- # next test if col_name in lookup of old aliases
284- # if so, convert the col_name
285- elif col_name in alias_lookup :
286- out_row [alias_lookup [col_name ]] = value
287- print (f"Found a long name in { fname } : { col_name } " )
288- # finally, raise exception if the field can't be matched
289- else :
290- msg = f"{ fname } : Can't match field '{ col_name } '"
291- raise ValidationError (msg )
292-
293- out_row ["Source Data Timestamp" ] = f ["source_datetime" ].isoformat ()
294- out_row ["Processed At" ] = processed_time
295- out_row ["Source Filename" ] = f ["filename" ]
296-
297- hist_csv_rows .append (out_row )
295+ row ["Source Data Timestamp" ] = f ["source_datetime" ].isoformat ()
296+ row ["Processed At" ] = processed_time
297+ row ["Source Filename" ] = f ["filename" ]
298+ hist_csv_rows .append (row )
298299
299300 else :
300301 print (f"{ fname } has a filesize of { size } , not processing." )
@@ -312,28 +313,31 @@ def process_historical_hos(self, processed_dir, processed_file_details, make_hi
312313 features = [Feature (attributes = row ) for row in hist_csv_rows ]
313314 fs = FeatureSet (features )
314315 if self .dry_run :
315- print ("Dry run set, not editing features." )
316+ if self .verbose :
317+ print ("Dry run set, not editing features." )
316318 else :
317319 fc = len (features )
318320 chunksize = 1000.0
319321 feature_batchs = chunks (features , math .ceil (fc / chunksize ))
320322 fb_list = list (feature_batchs )
321323 fbc = len (fb_list )
322- print (f"Adding { fc } features to the historical table in { fbc } batches." )
324+ if self .verbose :
325+ print (f"Adding { fc } features to the historical table in { fbc } batches." )
323326 for batch in fb_list :
324327 status = t .edit_features (adds = batch )
325328 print (status )
326- print ("Finished load of historical HOS table" )
329+ if self .verbose :
330+ print ("Finished load of historical HOS table" )
327331
328332 def process_daily_hospital_averages (self , historical_gis_item_id , daily_averages_item_id ):
329333 # see what days have been processed
330- # if not processed,
334+ # if not processed,
331335 # get the historical table
332336 # turn it into a df
333337 # per day, get the averages
334338 # for new: days
335339 print ("XXX daily_hospital_averages stub, returning." )
336- table = self .gis .content .get (historical_gis_item_id )
340+ table = self .agol . gis .content .get (historical_gis_item_id )
337341 t = table .layers [0 ]
338342
339343
0 commit comments