
Commit c20f803

more matches and track all scrape urls in a single list
Signed-off-by: John Seekins <[email protected]>
1 parent 23fe9c3 commit c20f803

2 files changed: 11 additions, 4 deletions

schemas.py

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@
         "street": "",
     },
     "_repaired_record": False,
-    "facility_url": "",
     "field_office": "",
     "image_url": "",
     "name": "",

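For context, a minimal sketch of the shape the default record takes after this change. Only the fields visible in the hunk above come from the source; the "source_urls" list is an assumption, inferred from the scraper.py hunks below that append to it:

# Hypothetical sketch, not the repository's actual schemas.py.
default_facility = {
    "address": {
        "street": "",
    },
    "_repaired_record": False,
    "field_office": "",
    "image_url": "",
    "name": "",
    "source_urls": [],  # assumed: the single list replacing the removed "facility_url"
}
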
scraper.py

Lines changed: 11 additions & 3 deletions
@@ -115,14 +115,14 @@ def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
         {"match": "601 Central Avenue", "replace": "601 CENTRAL AVE", "locality": "Newport"},
         {"match": "501 E Court Avenue", "replace": "501 EAST COURT AVE", "locality": "Jeffersonville"},
         {"match": "3200 S. Kings Hwy", "replace": "3700 S KINGS HWY", "locality": "Cushing"},
-        {"match": "325 Court Street", "replace": "325 COURT STREET", "locality": "Sault Ste. Marie"},
         {"match": "301 South Walnut", "replace": "301 SOUTH WALNUT STREET", "locality": "Cottonwood Falls"},
         {"match": "830 Pine Hill Road", "replace": "830 PINEHILL ROAD", "locality": "Jena"},
         {
             "match": "11093 SW Lewis Memorial Dr",
             "replace": "11093 SW LEWIS MEMORIAL DRIVE",
             "locality": "Bowling Green",
         },
+        {"match": "58 Pine Mountain Road", "replace": "58 PINE MOUNTAIN RD", "locality": "McElhattan"},
         # a unique one, 'cause the PHONE NUMBER IS IN THE ADDRESS?!
         {"match": "911 PARR BLVD 775 328 3308", "replace": "911 E Parr Blvd", "locality": "RENO"},
         # default matches should come last
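
For orientation, a rough sketch of how a match table like this could be applied to a street string. The helper below is an assumption for illustration, not the repository's actual _clean_street logic:

# Hypothetical helper; the real _clean_street in scraper.py may differ.
from typing import Tuple

def apply_street_matches(street: str, locality: str, matches: list) -> Tuple[str, bool]:
    for rule in matches:
        # locality-scoped rules only fire for that locality
        if rule.get("locality") and rule["locality"].lower() != locality.lower():
            continue
        if rule["match"].lower() in street.lower():
            return rule["replace"], True
    return street, False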
@@ -186,6 +186,9 @@ def _repair_locality(self, locality: str, administrative_area: str) -> Tuple[str
         if locality == "Cottonwood Falls" and administrative_area == "KS":
             locality = "COTTONWOOD FALL"
             cleaned = True
+        if locality == "Sault Ste. Marie" and administrative_area == "MI":
+            locality = "SAULT STE MARIE"
+            cleaned = True
         return locality, cleaned
 
     def _load_sheet(self) -> dict:
@@ -240,6 +243,7 @@ def _load_sheet(self) -> dict:
             details["facility_type"] = row["Type Detailed"]
             details["avg_stay_length"] = row["FY25 ALOS"]
             details["inspection_date"] = row["Last Inspection End Date"]
+            details["source_urls"].append(self.sheet_url)
             results[full_address] = details
         return results
 
@@ -285,6 +289,7 @@ def scrape_facilities(self):
                 self.facilities_data["facilities"][full_address] = self._update_facility(
                     self.facilities_data["facilities"][full_address], facility
                 )
+                self.facilities_data["facilities"][full_address]["address"] = addr
             # this is likely to produce _some_ duplicates, but it's a reasonable starting place
             else:
                 self.facilities_data["facilities"][facility["name"]] = facility
@@ -466,9 +471,12 @@ def _extract_single_facility(self, element, page_url):
         if image_element:
             facility["image_url"] = f"https://www.ice.gov{image_element[0]['src']}"
         facility_url_element = element.findAll("a")
+        facility_url = ""
         if facility_url_element:
-            facility["facility_url"] = f"https://www.ice.gov{facility_url_element[0]['href']}"
-            facility["page_updated_date"] = self._scrape_updated(facility.get("facility_url", ""))
+            facility_url = f"https://www.ice.gov{facility_url_element[0]['href']}"
+            facility["source_urls"].append(facility_url)
+        if facility_url:
+            facility["page_updated_date"] = self._scrape_updated(facility_url)
         # Clean up extracted data
         facility = self._clean_facility_data(facility)
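
Taken together, these scraper.py hunks route every URL a record came from (the spreadsheet URL in _load_sheet and the facility detail page in _extract_single_facility) into the single source_urls list. A rough sketch of the resulting pattern; the helper name and the deduplication check are assumptions, not code from the diff:

# Hypothetical sketch of "track all scrape urls in a single list".
def record_source_url(facility: dict, url: str) -> None:
    urls = facility.setdefault("source_urls", [])
    if url and url not in urls:  # assumed dedup; the diff appends unconditionally
        urls.append(url)

# Usage mirroring the two call sites added in this commit:
#   record_source_url(details, self.sheet_url)      # in _load_sheet
#   record_source_url(facility, facility_url)       # in _extract_single_facility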
