Skip to content

Commit 4cb85a3

Browse files
committed
more matching fixes and actually propagate updates down nested dicts
Signed-off-by: John Seekins <[email protected]>
1 parent 1980951 commit 4cb85a3

File tree

1 file changed

+16
-5
lines changed

1 file changed

+16
-5
lines changed

scraper.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
7777
{"match": "1623 E J Street, Suite 2", "replace": "1623 E. J STREET", "locality": "Tacoma"},
7878
{"match": "1805 W 32nd Street", "replace": "1805 W 32ND ST", "locality": "Baldwin"},
7979
{"match": "500 Hilbig Road", "replace": "500 HILBIG RD", "locality": "Conroe"},
80+
{"match": "806 Hilbig Road", "replace": "806 HILBIG RD", "locality": "Conroe"},
8081
{"match": "425 Golden State Avenue", "replace": "425 Golden State Ave", "locality": "Bakersfield"},
8182
{"match": "832 East Texas HWY 44", "replace": "832 EAST TEXAS STATE HIGHWAY 44", "locality": "Encinal"},
8283
{"match": "18201 SW 12th Street", "replace": "18201 SW 12TH ST", "locality": "Miami"},
@@ -88,7 +89,7 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
8889
{"match": "1701 North Washington", "replace": "1701 NORTH WASHINGTON ST", "locality": "Grand Forks"},
8990
{"match": "611 Frontage Road", "replace": "611 FRONTAGE RD", "locality": "McFarland"},
9091
{"match": "12450 Merritt Road", "replace": "12450 MERRITT DR", "locality": "Chardon"},
91-
{"match": "411 S. Broadway Avenue", "replace": "411 SOUTH BROADWAY AVENUE", "locality": "Chardon"},
92+
{"match": "411 S. Broadway Avenue", "replace": "411 SOUTH BROADWAY AVENUE", "locality": "Albert Lea"},
9293
{"match": "3424 Hwy 252 E", "replace": "3424 HIGHWAY 252 EAST", "locality": "Folkston"},
9394
# a unique one, 'cause the PHONE NUMBER IS IN THE ADDRESS?!
9495
{"match": "911 PARR BLVD 775 328 3308", "replace": "911 E Parr Blvd", "locality": "RENO"},
@@ -142,6 +143,9 @@ def _repair_locality(self, locality: str, administrative_area: str) -> Tuple[str
142143
if locality == "Leachfield" and administrative_area == "KY":
143144
locality = "LEITCHFIELD"
144145
cleaned = True
146+
if locality == "Susupe, Saipan" and administrative_area == "MP":
147+
locality = "SAIPAN"
148+
cleaned = True
145149
return locality, cleaned
146150

147151
def _load_sheet(self) -> dict:
@@ -199,6 +203,14 @@ def _load_sheet(self) -> dict:
199203
results[full_address] = details
200204
return results
201205

206+
def _update_facility(self, old: dict, new: dict) -> dict:
207+
for k, v in new.items():
208+
if isinstance(v, dict):
209+
old[k] = self._update_facility(old[k], new[k])
210+
if not old.get(k, None):
211+
old[k] = v
212+
return old
213+
202214
def scrape_facilities(self):
203215
"""Scrape all ICE detention facility data from all 6 pages"""
204216
start_time = time.time()
@@ -230,10 +242,9 @@ def scrape_facilities(self):
230242
facility["_repaired_record"] = True
231243
full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper()
232244
if full_address in self.facilities_data["facilities"].keys():
233-
for key, value in facility.items():
234-
if self.facilities_data["facilities"][full_address].get(key, None):
235-
continue
236-
self.facilities_data["facilities"][full_address][key] = value
245+
self.facilities_data["facilities"][full_address] = self._update_facility(
246+
self.facilities_data["facilities"][full_address], facility
247+
)
237248
# this is likely to produce _some_ duplicates, but it's a reasonable starting place
238249
else:
239250
self.facilities_data["facilities"][facility["name"]] = facility

0 commit comments

Comments
 (0)