Skip to content

Commit 17a9fef

Browse files
committed
properly add default field office
Signed-off-by: John Seekins <[email protected]>
1 parent 385b069 commit 17a9fef

File tree

2 files changed

+14
-9
lines changed

2 files changed

+14
-9
lines changed

schemas.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@
4343
},
4444
},
4545
"facility_type": "",
46+
"facility_type_detail": {},
4647
"inspection_date": None,
4748
"avg_stay_length": 0,
4849
}
@@ -70,9 +71,13 @@
7071
"description": "A facility primarily contracted with the USMS for housing of USMS detainees, in which ICE contracts with the USMS for bed space.",
7172
},
7273
"USMSIGA": {
73-
"expanded_name": "USMS Intergovernmental Agreement",
74+
"expanded_name": "United States Marshal Service Intergovernmental Agreement",
7475
"description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
7576
},
77+
"USMS CDF": {
78+
"expanded_name": "United States Marshal Service Central Detention Facility",
79+
"description": "Name guessed at from searching",
80+
},
7681
}
7782

7883
# enrichment response object
@@ -90,3 +95,5 @@
9095
"wikidata_found": 0,
9196
"osm_found": 0,
9297
}
98+
99+
default_field_office = "(Possibly) Not managed by DHS field office"

scraper.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
import polars
88
import re
99
from schemas import (
10+
default_field_office,
1011
facilities_schema,
1112
facility_schema,
13+
ice_facility_types,
1214
)
1315
import time
1416
from typing import Tuple
@@ -261,9 +263,11 @@ def _load_sheet(self) -> dict:
261263
details["population"]["male"]["allowed"] = True
262264

263265
details["facility_type"] = row["Type Detailed"]
266+
details["facility_type_detail"] = ice_facility_types.get(row["Type Detailed"], {})
264267
details["avg_stay_length"] = row["FY25 ALOS"]
265268
details["inspection_date"] = row["Last Inspection End Date"]
266269
details["source_urls"].append(self.sheet_url)
270+
details["field_office"] = default_field_office
267271
results[full_address] = details
268272
return results
269273

@@ -323,6 +327,8 @@ def scrape_facilities(self):
323327
self.facilities_data["facilities"][full_address] = self._update_facility(
324328
self.facilities_data["facilities"][full_address], facility
325329
)
330+
if facility["field_office"]:
331+
self.facilities_data["facilities"][full_address]["field_office"] = facility["field_office"]
326332
# update to the frequently nicer address from ice.gov
327333
self.facilities_data["facilities"][full_address]["address"] = addr
328334
# add scraped urls
@@ -331,17 +337,9 @@ def scrape_facilities(self):
331337
if url in self.facilities_data["facilities"][full_address]["source_urls"]:
332338
continue
333339
self.facilities_data["facilities"][full_address]["source_urls"].append(url)
334-
if not self.facilities_data["facilities"][full_address].get("field_office", ""):
335-
self.facilities_data["facilities"][full_address]["field_office"] = (
336-
"(Possibly) Not managed by DHS field office"
337-
)
338340
# this is likely to produce _some_ duplicates, but it's a reasonable starting place
339341
else:
340342
self.facilities_data["facilities"][facility["name"]] = facility
341-
if not self.facilities_data["facilities"][facility["name"]].get("field_office", ""):
342-
self.facilities_data["facilities"][facility["name"]]["field_office"] = (
343-
"(Possibly) Not managed by DHS field office"
344-
)
345343

346344
self.facilities_data["scrape_runtime"] = time.time() - start_time
347345
logger.info("Total facilities scraped: %s", len(self.facilities_data["facilities"]))

0 commit comments

Comments
 (0)