Skip to content

Commit bf1197f

Browse files
authored
Merge pull request #13 from RTCovid/improve_processing_speed
Improve processing speed
2 parents a6d501c + e2691cb commit bf1197f

File tree

5 files changed

+196
-135
lines changed

5 files changed

+196
-135
lines changed

agol_connection.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,16 @@
99

1010
class AGOLConnection(object):
1111

12-
def __init__(self):
12+
def __init__(self, verbose=False):
1313

1414
creds = self._load_credentials()
1515
if creds is None:
1616
raise Exception("no arcgis credentials supplied")
1717

1818
self.creds = creds
1919
self.layers = self._get_layers()
20-
self.gis = self._get_gis()
20+
self.gis = self._make_connection()
21+
self.verbose = verbose
2122

2223
def _load_credentials(self):
2324

@@ -42,7 +43,7 @@ def _get_layers(self):
4243

4344
return configs
4445

45-
def _get_gis(self):
46+
def _make_connection(self):
4647
username = self.creds['username']
4748
password = self.creds['password']
4849
host = self.creds['host']
@@ -64,21 +65,25 @@ def get_arcgis_feature_collection_from_item_id(self, arcgis_item_id):
6465

6566
def overwrite_arcgis_layer(self, dataset_name, source_data_dir, source_data_file, dry_run=False):
6667

67-
print(f"Begin upload to ArcGIS Online")
68-
if dry_run is True:
69-
print("** DRY RUN -- NO UPLOAD WILL HAPPEN **")
68+
if self.verbose:
69+
print(f"Begin upload to ArcGIS Online")
70+
if dry_run is True:
71+
print("** DRY RUN -- NO UPLOAD WILL HAPPEN **")
7072

7173
try:
7274
layer_config = self.layers[dataset_name]
7375
except KeyError:
74-
print(f"Invalid dataset name: {dataset_name}. Valid options are {list(self.layers.keys())}. Alter agol_layers.json to add more.")
76+
if self.verbose:
77+
print(f"Invalid dataset name: {dataset_name}. Valid options are"
78+
f" {list(self.layers.keys())}. Alter agol_layers.json to add more.")
7579
return False
7680

7781
original_file_name = layer_config['original_file_name']
7882
item_id = layer_config['id']
7983

80-
print(f" ArcGIS Online item id: {layer_config['id']}")
81-
print(f" CSV name used for upload: {layer_config['original_file_name']}")
84+
if self.verbose:
85+
print(f" ArcGIS Online item id: {layer_config['id']}")
86+
print(f" CSV name used for upload: {layer_config['original_file_name']}")
8287

8388
fs = self.get_arcgis_feature_collection_from_item_id(item_id)
8489
# Overwrite docs:
@@ -96,19 +101,24 @@ def overwrite_arcgis_layer(self, dataset_name, source_data_dir, source_data_file
96101
shutil.copyfile(os.path.join(source_data_dir, source_data_file),
97102
os.path.join(tmpdirname, original_file_name))
98103

99-
print(f" local CSV file name: {source_data_dir}/{source_data_file}")
104+
if self.verbose:
105+
print(f" local CSV file name: {source_data_dir}/{source_data_file}")
100106
original_dir = os.getcwd()
101107
os.chdir(tmpdirname)
102108
if dry_run is False:
103109
try:
104-
print(" starting upload...")
110+
if self.verbose:
111+
print(" starting upload...")
105112
result = fs.manager.overwrite(original_file_name)
106113
except Exception as e:
107-
print(f"Caught exception {e} during upload, retrying")
114+
if self.verbose:
115+
print(f"Caught exception {e} during upload, retrying")
108116
result = fs.manager.overwrite(original_file_name)
109-
print(" finished.")
117+
if self.verbose:
118+
print(" finished.")
110119
else:
111-
result = "Dry run complete"
120+
result = "Dry run complete"
112122
os.chdir(original_dir)
113123
return result
114124

header_mapping.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,19 @@ def get_fieldnames_and_aliases(self):
5151

5252
return self.get_fieldnames() + self.get_aliases()
5353

54+
def get_master_lookup(self):
55+
"""This lookup has keys for all long names AND all short names. Each key
56+
corresponds to the proper short name. Allows a single point of entry for
57+
any header name."""
58+
59+
lookup = {}
60+
for k, v in self.mapping.items():
61+
lookup[k] = k
62+
for alias in v:
63+
lookup[alias] = k
64+
65+
return lookup
66+
5467

5568
ltc_mapping = {}
5669

ingester.py

Lines changed: 72 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from operators import get_datetime_from_filename
1414
from arcgis.features import FeatureLayerCollection, FeatureSet, Table, Feature
1515
from validator import ValidationError
16+
from agol_connection import AGOLConnection
1617

1718

1819
def load_csv_to_df(csv_file_path):
@@ -62,16 +63,18 @@ def create_summary_table_row(df, source_data_timestamp, source_filename):
6263

6364
class Ingester(object):
6465

65-
def __init__(self, dry_run=False):
66+
def __init__(self, dry_run=False, verbose=False):
6667

6768
creds = self._load_credentials()
6869
if creds is None:
6970
raise Exception("no sftp credentials supplied")
7071

7172
self.creds = creds
7273
self.dry_run = dry_run
73-
self.gis = None
74+
agol_connection = AGOLConnection()
75+
self.agol = agol_connection
7476
self.available_files = []
77+
self.verbose = verbose
7578

7679
def _load_credentials(self):
7780

@@ -89,12 +92,8 @@ def _load_credentials(self):
8992
creds['host'] = row['host']
9093
return creds
9194

92-
def set_gis(self, gis_object):
93-
94-
self.gis = gis_object
95-
9695
def get_files_from_sftp(self, prefix="HOS_ResourceCapacity_", target_dir="/tmp",
97-
only_latest=True, filenames_to_ignore=[]):
96+
only_latest=True, filenames_to_ignore=[], verbose=False):
9897

9998
cnopts = pysftp.CnOpts()
10099
cnopts.hostkeys.load('copaftp.pub')
@@ -121,19 +120,27 @@ def get_files_from_sftp(self, prefix="HOS_ResourceCapacity_", target_dir="/tmp",
121120
files_to_get = files
122121
for f in files_to_get:
123122
if f in filenames_to_ignore:
124-
print(f"Ignoring {f}")
123+
if self.verbose:
124+
print(f"Ignoring {f}")
125125
continue
126-
print(f"Getting: {f}")
126+
if self.verbose:
127+
print(f"Getting: {f}")
127128
if os.path.join(target_dir, f) not in existing_files:
128129
sftp.get(f, f'{target_dir}/{f}')
129-
print(f"Finished downloading {target_dir}/{f}")
130+
if self.verbose:
131+
print(f"Finished downloading {target_dir}/{f}")
130132
else:
131-
print(f"Didn't have to download {target_dir}/{f}; it already exists")
133+
if self.verbose:
134+
print(f"Didn't have to download {target_dir}/{f}; it already exists")
132135

133136
source_date = get_datetime_from_filename(f, prefix=prefix)
134137
file_details.append({"dir": target_dir, "filename": f, "source_datetime": source_date})
135138
return (file_details, files)
136139

140+
def get_already_processed_files(self, dataset_name):
141+
142+
return self.agol.get_already_processed_files(dataset_name)
143+
137144
def process_hospital(self, processed_dir, processed_filename, public=True):
138145

139146
# public vs. non-public means different ArcGIS online items
@@ -142,32 +149,38 @@ def process_hospital(self, processed_dir, processed_filename, public=True):
142149
else:
143150
dataset_name = "hospital_layer"
144151

145-
print(f"Starting load of hospital data: {dataset_name}")
152+
if self.verbose:
153+
print(f"Starting load of hospital data: {dataset_name}")
146154

147-
status = self.gis.overwrite_arcgis_layer(dataset_name, processed_dir, processed_filename, dry_run=self.dry_run)
155+
status = self.agol.overwrite_arcgis_layer(dataset_name, processed_dir, processed_filename, dry_run=self.dry_run)
148156

149-
print(status)
150-
print(f"Finished load of hospital data: {dataset_name}")
157+
if self.verbose:
158+
print(status)
159+
print(f"Finished load of hospital data: {dataset_name}")
151160
return processed_dir, processed_filename
152161

153162
def process_supplies(self, processed_dir, processed_filename):
154-
print("Starting load of supplies data")
163+
if self.verbose:
164+
print("Starting load of supplies data")
155165

156166
# set the new file name using the original file name in the layers conf
157-
supplies_filename = self.gis.layers['supplies']['original_file_name']
167+
supplies_filename = self.agol.layers['supplies']['original_file_name']
158168

159169
df = load_csv_to_df(os.path.join(processed_dir, processed_filename))
160170
supplies = create_supplies_table(df)
161171

162172
supplies.to_csv(os.path.join(processed_dir, supplies_filename), index=False)
163173

164-
status = self.gis.overwrite_arcgis_layer("supplies", processed_dir, supplies_filename, dry_run=self.dry_run)
165-
print(status)
166-
print("Finished load of supplies data")
174+
status = self.agol.overwrite_arcgis_layer("supplies", processed_dir, supplies_filename, dry_run=self.dry_run)
175+
176+
if self.verbose:
177+
print(status)
178+
print("Finished load of supplies data")
167179

168180
def process_county_summaries(self, processed_dir, processed_filename):
169181

170-
print("Starting load of county summary table...")
182+
if self.verbose:
183+
print("Starting load of county summary table...")
171184

172185
new_data_filename = "new_county_summary_table.csv"
173186

@@ -190,14 +203,18 @@ def process_county_summaries(self, processed_dir, processed_filename):
190203

191204
d2.to_csv(os.path.join(processed_dir, new_data_filename), header=True, index=False)
192205

193-
status = self.gis.overwrite_arcgis_layer("county_summaries", processed_dir, new_data_filename, dry_run=self.dry_run)
194-
print(status)
195-
print("Finished load of county summary data")
206+
status = self.agol.overwrite_arcgis_layer("county_summaries", processed_dir, new_data_filename, dry_run=self.dry_run)
207+
208+
if self.verbose:
209+
print(status)
210+
print("Finished load of county summary data")
196211

197212
def process_summaries(self, processed_dir, processed_file_details, make_historical_csv=False):
198-
print("Starting load of summary table...")
199213

200-
summary_filename = self.gis.layers['summary_table']['original_file_name']
214+
if self.verbose:
215+
print("Starting load of summary table...")
216+
217+
summary_filename = self.agol.layers['summary_table']['original_file_name']
201218

202219
summary_df = pd.DataFrame()
203220
for f in processed_file_details:
@@ -213,14 +230,15 @@ def process_summaries(self, processed_dir, processed_file_details, make_historic
213230
if make_historical_csv:
214231
out_csv_file = os.path.join(processed_dir, summary_filename)
215232
summary_df.to_csv(out_csv_file, index=False, header=True)
216-
print("Finished creation of historical summary table CSV, returning.")
233+
if self.verbose:
234+
print("Finished creation of historical summary table CSV, returning.")
217235
return
218236

219-
layer_conf = self.gis.layers['summary_table']
237+
layer_conf = self.agol.layers['summary_table']
220238

221239
# this self.gis.gis.content pattern is evidence that the first pass at
222240
# a refactored structure should not be the last...
223-
table = self.gis.gis.content.get(layer_conf['id'])
241+
table = self.agol.gis.content.get(layer_conf['id'])
224242
t = table.tables[0]
225243

226244
new_col_names = {}
@@ -237,64 +255,47 @@ def process_summaries(self, processed_dir, processed_file_details, make_historic
237255
# but it won't stop the processing.
238256
fs = FeatureSet(features)
239257
if self.dry_run:
240-
print("Dry run set, not editing features.")
241-
status = "Dry run"
258+
if self.verbose:
259+
print("Dry run set, not editing features.")
242260
else:
243261
status = t.edit_features(adds=features)
244-
print(status)
245-
print("Finished load of summary table")
262+
if self.verbose:
263+
print(status)
264+
if self.verbose:
265+
print("Finished load of summary table")
246266

247267

248268
def process_historical_hos(self, processed_dir, processed_file_details, make_historical_csv=False):
249269

250-
print("Starting load of historical HOS table...")
270+
if self.verbose:
271+
print("Starting load of historical HOS table...")
251272

252-
layer_conf = self.gis.layers['full_historical_table']
273+
layer_conf = self.agol.layers['full_historical_table']
253274
original_data_file_name = layer_conf['original_file_name']
254275

255-
table = self.gis.gis.content.get(layer_conf['id'])
276+
table = self.agol.gis.content.get(layer_conf['id'])
256277
t = table.layers[0]
257278

258279
# get short field names that are in use online to test the input csv headers
259-
agol_fields = {n["alias"]: n["name"] for n in t.properties.fields}
280+
# not used now but retained in case of future needs
281+
# agol_fields = {n["alias"]: n["name"] for n in t.properties.fields}
260282

261283
# iterate all csvs and collect the information from each one.
262284
# normalize header names at the same time
263285
hist_csv_rows = []
264-
mapping = hm.HeaderMapping("HOS")
265-
alias_lookup = mapping.get_alias_lookup() # used to convert historical names (pre-5/12)
266-
valid_fieldnames = mapping.get_fieldnames() # valid short names (post-5/12)
267286
for f in processed_file_details:
268287
fname = f["processed_filename"]
269288
size = os.path.getsize(os.path.join(processed_dir, fname))
270289
if size > 0:
271-
processed_time = datetime.utcnow().isoformat()
290+
processed_time = datetime.utcnow().isoformat()
272291
with open(os.path.join(processed_dir, fname), newline='') as csvfile:
273292
reader = csv.DictReader(csvfile)
274293
for row in reader:
275294

276-
out_row = {}
277-
278-
for col_name, value in row.items():
279-
# first test if col_name in new valid headers
280-
# if so, use the col_name and value directly
281-
if col_name in valid_fieldnames:
282-
out_row[col_name] = value
283-
# next test if col_name in lookup of old aliases
284-
# if so, convert the col_name
285-
elif col_name in alias_lookup:
286-
out_row[alias_lookup[col_name]] = value
287-
print(f"Found a long name in {fname}: {col_name}")
288-
# finally, raise exception if the field can't be matched
289-
else:
290-
msg = f"{fname}: Can't match field '{col_name}'"
291-
raise ValidationError(msg)
292-
293-
out_row["Source Data Timestamp"] = f["source_datetime"].isoformat()
294-
out_row["Processed At"] = processed_time
295-
out_row["Source Filename"] = f["filename"]
296-
297-
hist_csv_rows.append(out_row)
295+
row["Source Data Timestamp"] = f["source_datetime"].isoformat()
296+
row["Processed At"] = processed_time
297+
row["Source Filename"] = f["filename"]
298+
hist_csv_rows.append(row)
298299

299300
else:
300301
print(f"{fname} has a filesize of {size}, not processing.")
@@ -312,28 +313,31 @@ def process_historical_hos(self, processed_dir, processed_file_details, make_hi
312313
features = [Feature(attributes=row) for row in hist_csv_rows]
313314
fs = FeatureSet(features)
314315
if self.dry_run:
315-
print("Dry run set, not editing features.")
316+
if self.verbose:
317+
print("Dry run set, not editing features.")
316318
else:
317319
fc = len(features)
318320
chunksize = 1000.0
319321
feature_batchs = chunks(features, math.ceil(fc / chunksize))
320322
fb_list = list(feature_batchs)
321323
fbc = len(fb_list)
322-
print(f"Adding {fc} features to the historical table in {fbc} batches.")
324+
if self.verbose:
325+
print(f"Adding {fc} features to the historical table in {fbc} batches.")
323326
for batch in fb_list:
324327
status = t.edit_features(adds=batch)
325328
print(status)
326-
print("Finished load of historical HOS table")
329+
if self.verbose:
330+
print("Finished load of historical HOS table")
327331

328332
def process_daily_hospital_averages(self, historical_gis_item_id, daily_averages_item_id):
329333
# see what days have been processed
330-
# if not processed,
334+
# if not processed,
331335
# get the historical table
332336
# turn it into a df
333337
# per day, get the averages
334338
# for new: days
335339
print("XXX daily_hospital_averages stub, returning.")
336-
table = self.gis.content.get(historical_gis_item_id)
340+
table = self.agol.gis.content.get(historical_gis_item_id)
337341
t = table.layers[0]
338342

339343

0 commit comments

Comments
 (0)