
Commit d21af02

add enrichment from ADP sheet
Signed-off-by: John Seekins <[email protected]>
1 parent: dad899d

10 files changed: +1303 −142 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 __pycache__/
 *.csv
 *.json
+*.xlsx

default_data.py

Lines changed: 1082 additions & 123 deletions
Large diffs are not rendered by default.

enricher.py

Lines changed: 98 additions & 3 deletions
@@ -1,4 +1,6 @@
 import copy
+import os
+import polars
 from schemas import (
     facilities_schema,
     resp_info_schema,
@@ -9,18 +11,97 @@
     logger,
     session,
 )
-
 # ExternalDataEnricher class for enrichment logic

+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 # Rate limiting for API calls
 NOMINATIM_DELAY = 1.0  # 1 second between requests as per OSM policy
 WIKIPEDIA_DELAY = 0.5  # Be respectful to Wikipedia
 WIKIDATA_DELAY = 0.5  # Be respectful to Wikidata

+# extracted ADP sheet header list 2025-09-07
+facility_sheet_header = [
+    "Name",
+    "Address",
+    "City",
+    "State",
+    "Zip",
+    "AOR",
+    "Type Detailed",
+    "Male/Female",
+    "FY25 ALOS",
+    "Level A",
+    "Level B",
+    "Level C",
+    "Level D",
+    "Male Crim",
+    "Male Non-Crim",
+    "Female Crim",
+    "Female Non-Crim",
+    "ICE Threat Level 1",
+    "ICE Threat Level 2",
+    "ICE Threat Level 3",
+    "No ICE Threat Level",
+    "Mandatory",
+    "Guaranteed Minimum",
+    "Last Inspection Type",
+    "Last Inspection End Date",
+    "Pending FY25 Inspection",
+    "Last Inspection Standard",
+    "Last Final Rating",
+]
+

 class ExternalDataEnricher(object):
     def __init__(self):
-        pass
+        self.sheet_url = "https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx"
+        self.filename = f"{SCRIPT_DIR}{os.sep}detentionstats.xlsx"
+        self.adp_sheet_data = self._load_sheet()
+
+    def _download_sheet(self) -> None:
+        if not os.path.isfile(self.filename) or os.path.getsize(self.filename) < 1:
+            logger.info("Downloading sheet from %s", self.sheet_url)
+            resp = session.get(self.sheet_url, timeout=120)
+            with open(self.filename, "wb") as f:
+                for chunk in resp.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+
+    def _load_sheet(self) -> dict:
+        """Convert the detentionstats sheet data into something we can update our facilities with"""
+        self._download_sheet()
+        df = polars.read_excel(
+            drop_empty_rows=True,
+            has_header=False,
+            # because we're manually defining the header...
+            read_options={"skip_rows": 7, "column_names": facility_sheet_header},
+            sheet_name="Facilities FY25",
+            source=open(self.filename, "rb"),
+        )
+        results: dict = {}
+        for row in df.iter_rows(named=True):
+            full_address = f"{row['Address']} {row['City']}, {row['State']} {row['Zip']}".upper()
+            results[full_address] = row
+        return results
+
+    def _update_from_sheet(self, base: dict, row: dict) -> dict:
+        base["population"]["male"]["criminal"] = row["Male Crim"]
+        base["population"]["male"]["non_criminal"] = row["Male Non-Crim"]
+        base["population"]["female"]["criminal"] = row["Female Crim"]
+        base["population"]["female"]["non_criminal"] = row["Female Non-Crim"]
+        if "/" in row["Male/Female"]:
+            base["population"]["female"]["allowed"] = True
+            base["population"]["male"]["allowed"] = True
+        elif "Female" in row["Male/Female"]:
+            base["population"]["female"]["allowed"] = True
+        else:
+            base["population"]["male"]["allowed"] = True
+
+        base["base_type"] = row["Type Detailed"]
+        base["avg_stay_length"] = row["FY25 ALOS"]
+        base["inspection_date"] = row["Last Inspection End Date"]
+        logger.debug("Updated facility: %s", base)
+        return base

     def enrich_facility_data(self, facilities_data: dict) -> dict:
         start_time = time.time()
@@ -32,7 +113,21 @@ def enrich_facility_data(self, facilities_data: dict) -> dict:
             facility_name = facility["name"]
             logger.info("Processing facility %s/%s: %s...", index + 1, total, facility_name)
             enriched_facility = copy.deepcopy(facility)
-
+            addr = facility["address"]
+            full_address = (
+                f"{addr['street']} {addr['locality']}, {addr['administrative_area']} {addr['postal_code']}".upper()
+            )
+            if full_address in self.adp_sheet_data:
+                row = self.adp_sheet_data[full_address]
+                logger.debug("Found additional data in the ADP sheet for %s", facility_name)
+                facility = self._update_from_sheet(facility, row)
+            else:
+                logger.debug("Just making sure no other facilities match...")
+                for sheet_row in self.adp_sheet_data.values():
+                    if facility_name.upper() == sheet_row["Name"].upper():
+                        logger.debug("Matching facility for %s", facility_name)
+                        facility = self._update_from_sheet(facility, sheet_row)
+                        break
             # Wikipedia search # todo refactor to method
             try:
                 wiki_result = self._search_wikipedia(facility_name)
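
The sheet rows are keyed by a normalized full address, with a case-insensitive facility-name fallback when the address strings disagree. A minimal sketch of that two-step match, using hypothetical sample data (the real keys come from the ADP sheet and the scraped facility records):

# Hypothetical sample row keyed the way _load_sheet builds its dict.
adp_sheet_data = {
    "100 MAIN ST ANYTOWN, TX 75001": {"Name": "Anytown Detention Center", "FY25 ALOS": 42.0},
}

def find_sheet_row(facility: dict) -> dict | None:
    addr = facility["address"]
    full_address = (
        f"{addr['street']} {addr['locality']}, "
        f"{addr['administrative_area']} {addr['postal_code']}".upper()
    )
    # Step 1: exact (normalized) address match.
    if full_address in adp_sheet_data:
        return adp_sheet_data[full_address]
    # Step 2: fall back to a name match when address formats differ.
    for row in adp_sheet_data.values():
        if facility["name"].upper() == row["Name"].upper():
            return row
    return None

facility = {
    "name": "ANYTOWN DETENTION CENTER",
    "address": {
        "street": "101 Main St",  # differs from the sheet, forcing the name fallback
        "locality": "Anytown",
        "administrative_area": "TX",
        "postal_code": "75001",
    },
}
print(find_sheet_row(facility))  # matched via name despite the address mismatch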

file_utils.py

Lines changed: 4 additions & 2 deletions
@@ -1,12 +1,15 @@
 import copy
 import csv
 import json
+import os
 from schemas import enrichment_print_schema
 from utils import (
     _flatdict,
     logger,
 )

+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+

 def export_to_file(
     facilities_data: dict,
@@ -17,7 +20,7 @@ def export_to_file(
         logger.warning("No data to export!")
         return ""

-    full_name = f"{filename}.{file_type}"
+    full_name = f"{SCRIPT_DIR}{os.sep}{filename}.{file_type}"
     csv_filtered_keys = ["raw_scrape", "wikipedia_search_query", "wikidata_search_query", "osm_search_query"]
     try:
         with open(full_name, "w", newline="", encoding="utf-8") as f_out:
@@ -56,7 +59,6 @@ def print_summary(facilities_data: dict) -> None:
     total_facilities = len(facilities_data["facilities"])
     logger.info("\n=== ICE Detention Facilities Scraper Summary ===")
     logger.info("Scraped data at %s", facilities_data["scraped_date"])
-    logger.info("ice.gov data updated at %s", facilities_data["page_updated_date"])
     logger.info("Total facilities: %s", total_facilities)

     # Count by field office
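
Anchoring exports to SCRIPT_DIR means output files land next to the code regardless of the caller's working directory. A short sketch of the same pattern, run as a script (filename and file_type here are illustrative values, not the real CLI arguments):

import os

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# Illustrative values; the real ones arrive via export_to_file's arguments.
filename, file_type = "facilities", "csv"
full_name = f"{SCRIPT_DIR}{os.sep}{filename}.{file_type}"
print(full_name)  # e.g. /path/to/repo/facilities.csv, independent of the cwd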

main.py

Lines changed: 1 addition & 4 deletions
@@ -26,7 +26,6 @@
 from enricher import ExternalDataEnricher
 from scraper import ICEGovFacilityScraper
 from utils import logger
-
 # CLI, argument parsing, script orchestration


@@ -82,9 +81,7 @@ def main() -> None:
         facilities_data = scraper.scrape_facilities()
     elif args.load_existing:
         facilities_data = copy.deepcopy(default_data.facilities_data)
-        logger.info(
-            "Loaded %s existing facilities from local data. (Not scraping ICE.gov)", len(facilities_data["facilities"])
-        )
+        logger.info("Loaded %s existing facilities from local data. (Not scraping)", len(facilities_data["facilities"]))

     if args.enrich:
         if not facilities_data:

pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -6,7 +6,10 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "beautifulsoup4>=4.13.5",
+    "fastexcel>=0.15.1",
     "lxml>=6.0.1",
+    "polars>=1.33.0",
+    "pyarrow>=21.0.0",
     "requests>=2.32.5",
 ]

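
The three new dependencies all serve the spreadsheet ingestion: polars provides read_excel, which delegates .xlsx parsing to fastexcel (the calamine-based default engine in recent polars releases), and pyarrow presumably covers Arrow interop. A quick sanity check of the toolchain, assuming the workbook already sits next to the script (the enricher downloads it on first run):

import polars

# Same options the commit uses in enricher.py; read_options is
# passed through to the fastexcel engine.
df = polars.read_excel(
    "detentionstats.xlsx",
    sheet_name="Facilities FY25",
    has_header=False,
    read_options={"skip_rows": 7},
)
print(df.shape)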

schemas.py

Lines changed: 47 additions & 3 deletions
@@ -2,33 +2,77 @@

 facilities_schema = {
     "scraped_date": datetime.datetime.utcnow(),
-    "page_updated_date": datetime.datetime.utcnow(),
     "scrape_runtime": 0,
     "enrich_runtime": 0,
     "facilities": [],
 }

+# default keys to "false"-y values so we can merge easier
 facility_schema = {
     "address": {
-        "street": "",
         "administrative_area": "",
         "country": "",
         "locality": "",
         "postal_code": "",
+        "street": "",
     },
     "facility_url": "",
     "field_office": "",
     "image_url": "",
     "name": "",
     "phone": "",
     "raw_scrape": "",
-    "source_url": "",
+    "source_urls": [],
     "wikipedia_page_url": "",
     "wikidata_page_url": "",
     "osm_result_url": "",
     "wikipedia_search_query": "",
     "wikidata_search_query": "",
     "osm_search_query": "",
+    "page_updated_date": None,
+    "population": {
+        "male": {
+            "allowed": False,
+            "criminal": 0,
+            "non_criminal": 0,
+        },
+        "female": {
+            "allowed": False,
+            "criminal": 0,
+            "non_criminal": 0,
+        },
+    },
+    "facility_type": "",
+    "inspection_date": None,
+    "avg_stay_length": 0,
+}
+
+# extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx 2025-09-07
+ice_facility_types = {
+    "BOP": {
+        "expanded_name": "Federal Bureau of Prisons",
+        "description": "A facility operated by the Federal Bureau of Prisons",
+    },
+    "DIGSA": {
+        "expanded_name": "Dedicated Intergovernmental Service Agreement",
+        "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Inter-governmental Service Agreements, which house only ICE detainees – typically these are operated by private contractors pursuant to their agreements with local governments.",
+    },
+    "IGSA": {
+        "expanded_name": "Intergovernmental Service Agreement",
+        "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts for bed space via an Intergovernmental Service Agreement; or local jails used by ICE pursuant to Inter-governmental Service Agreements, which house both ICE and non-ICE detainees, typically county prisoners awaiting trial or serving short sentences, but sometimes also USMS prisoners.",
+    },
+    "SPC": {
+        "expanded_name": "Service Processing Center",
+        "description": "A facility owned by the government and staffed by a combination of federal and contract employees.",
+    },
+    "USMS": {
+        "expanded_name": "United States Marshals Service",
+        "description": "A facility primarily contracted with the USMS for housing of USMS detainees, in which ICE contracts with the USMS for bed space.",
+    },
+    "USMSIGA": {
+        "expanded_name": "USMS Intergovernmental Agreement",
+        "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
+    },
 }

 # enrichment response object
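
The "false"-y defaults comment pays off when merging: every facility starts with a complete key set, so enrichment code can assign nested keys without existence checks, and consumers can still tell "no data" (the default) from a real value. A small illustration, where facility_schema is a trimmed stand-in for the full schema above:

import copy

# Trimmed stand-in for the real facility_schema in schemas.py.
facility_schema = {
    "name": "",
    "population": {
        "male": {"allowed": False, "criminal": 0, "non_criminal": 0},
        "female": {"allowed": False, "criminal": 0, "non_criminal": 0},
    },
    "avg_stay_length": 0,
}

facility = copy.deepcopy(facility_schema)
# No key checks needed: the nested structure is guaranteed to exist.
facility["population"]["male"]["criminal"] = 120
facility["avg_stay_length"] = 37.5
print(facility["population"]["female"]["criminal"])  # still the 0 default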

scraper.py

Lines changed: 9 additions & 7 deletions
@@ -10,25 +10,25 @@
 )
 import time
 from utils import (
+    default_timestamp,
     logger,
     session,
+    timestamp_format,
 )


 class ICEGovFacilityScraper(object):
-    # All methods for scraping ICE websites
-
+    # All methods for scraping ice.gov websites
     def __init__(self):
         self.base_url = "https://www.ice.gov/detention-facilities"
         self.facilities_data = copy.deepcopy(facilities_schema)

     def scrape_facilities(self):
         """Scrape all ICE detention facility data from all 6 pages"""
         start_time = time.time()
-        logger.info("Starting to scrape ICE detention facilities...")
+        logger.info("Starting to scrape ICE.gov detention facilities...")

         self.facilities_data["scraped_date"] = datetime.datetime.utcnow()
-        self.facilities_data["page_updated_date"] = self._scrape_updated(self.base_url)
         # URLs for all pages
         urls = [f"{self.base_url}?exposed_form_display=1&page={i}" for i in range(6)]

@@ -50,8 +50,9 @@ def scrape_facilities(self):

     def _scrape_updated(self, url: str):
         """Scrape first page to get "last updated" time"""
-        default_timestamp = "1970-01-01T00:00:00-+0000"
-        timestamp_format = "%Y-%m-%dT%H:%M:%S-%z"
+        if not url:
+            logger.error("Could not find a time block! Guessing wildly!")
+            return datetime.datetime.strptime(default_timestamp, timestamp_format)
         logger.debug(" Fetching: %s", url)
         try:
             response = session.get(url, timeout=30)
@@ -181,7 +182,7 @@ def _extract_single_facility(self, element, page_url):
         facility = copy.deepcopy(facility_schema)
         raw_scrape = str(element)
         facility["raw_scrape"] = base64.b64encode(raw_scrape.encode("utf-8")).decode("utf-8")
-        facility["source_url"] = page_url
+        facility["source_urls"].append(page_url)
         logger.debug("Trying to get facility data from %s", element)
         # Method 1: Try structured extraction if element has proper HTML structure
         name = element.select_one(".views-field-title")
@@ -221,6 +222,7 @@ def _extract_single_facility(self, element, page_url):
         facility_url_element = element.findAll("a")
         if facility_url_element:
             facility["facility_url"] = f"https://www.ice.gov{facility_url_element[0]['href']}"
+            facility["page_updated_date"] = self._scrape_updated(facility.get("facility_url", ""))
         # Clean up extracted data
         facility = self._clean_facility_data(facility)


utils.py

Lines changed: 3 additions & 0 deletions
@@ -18,6 +18,9 @@
 session.mount("http://", _adapter)
 session.headers.update({"User-Agent": "ICE-Facilities-Research/1.0 (Educational Research Purpose)"})

+default_timestamp = "1970-01-01T00:00:00-+0000"
+timestamp_format = "%Y-%m-%dT%H:%M:%S-%z"
+

 def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
     """flatten a nested dictionary for nicer printing in CSV"""
