
Commit d21af02

add enrichment from ADP sheet
Signed-off-by: John Seekins <[email protected]>
1 parent: dad899d

10 files changed: +1303 −142 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 __pycache__/
 *.csv
 *.json
+*.xlsx

default_data.py

Lines changed: 1082 additions & 123 deletions
Large diffs are not rendered by default.

enricher.py

Lines changed: 98 additions & 3 deletions
@@ -1,4 +1,6 @@
 import copy
+import os
+import polars
 from schemas import (
     facilities_schema,
     resp_info_schema,
@@ -9,18 +11,97 @@
     logger,
     session,
 )
-
 # ExternalDataEnricher class for enrichment logic

+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 # Rate limiting for API calls
 NOMINATIM_DELAY = 1.0  # 1 second between requests as per OSM policy
 WIKIPEDIA_DELAY = 0.5  # Be respectful to Wikipedia
 WIKIDATA_DELAY = 0.5  # Be respectful to Wikidata

+# extracted ADP sheet header list 2025-09-07
+facility_sheet_header = [
+    "Name",
+    "Address",
+    "City",
+    "State",
+    "Zip",
+    "AOR",
+    "Type Detailed",
+    "Male/Female",
+    "FY25 ALOS",
+    "Level A",
+    "Level B",
+    "Level C",
+    "Level D",
+    "Male Crim",
+    "Male Non-Crim",
+    "Female Crim",
+    "Female Non-Crim",
+    "ICE Threat Level 1",
+    "ICE Threat Level 2",
+    "ICE Threat Level 3",
+    "No ICE Threat Level",
+    "Mandatory",
+    "Guaranteed Minimum",
+    "Last Inspection Type",
+    "Last Inspection End Date",
+    "Pending FY25 Inspection",
+    "Last Inspection Standard",
+    "Last Final Rating",
+]
+

 class ExternalDataEnricher(object):
     def __init__(self):
-        pass
+        self.sheet_url = "https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx"
+        self.filename = f"{SCRIPT_DIR}{os.sep}detentionstats.xlsx"
+        self.adp_sheet_data = self._load_sheet()
+
+    def _download_sheet(self) -> None:
+        if not os.path.isfile(self.filename) or os.path.getsize(self.filename) < 1:
+            logger.info("Downloading sheet from %s", self.sheet_url)
+            resp = session.get(self.sheet_url, timeout=120)
+            with open(self.filename, "wb") as f:
+                for chunk in resp.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+
+    def _load_sheet(self) -> dict:
+        """Convert the detentionstats sheet data into something we can update our facilities with"""
+        self._download_sheet()
+        df = polars.read_excel(
+            drop_empty_rows=True,
+            has_header=False,
+            # because we're manually defining the header...
+            read_options={"skip_rows": 7, "column_names": facility_sheet_header},
+            sheet_name="Facilities FY25",
+            source=open(self.filename, "rb"),
+        )
+        results: dict = {}
+        for row in df.iter_rows(named=True):
+            full_address = f"{row['Address']} {row['City']}, {row['State']} {row['Zip']}".upper()
+            results[full_address] = row
+        return results
+
+    def _update_from_sheet(self, base: dict, row: dict) -> dict:
+        base["population"]["male"]["criminal"] = row["Male Crim"]
+        base["population"]["male"]["non_criminal"] = row["Male Non-Crim"]
+        base["population"]["female"]["criminal"] = row["Female Crim"]
+        base["population"]["female"]["non_criminal"] = row["Female Non-Crim"]
+        if "/" in row["Male/Female"]:
+            base["population"]["female"]["allowed"] = True
+            base["population"]["male"]["allowed"] = True
+        elif "Female" in row["Male/Female"]:
+            base["population"]["female"]["allowed"] = True
+        else:
+            base["population"]["male"]["allowed"] = True
+
+        base["base_type"] = row["Type Detailed"]
+        base["avg_stay_length"] = row["FY25 ALOS"]
+        base["inspection_date"] = row["Last Inspection End Date"]
+        logger.debug("Updated facility: %s", base)
+        return base

     def enrich_facility_data(self, facilities_data: dict) -> dict:
         start_time = time.time()
@@ -32,7 +113,21 @@ def enrich_facility_data(self, facilities_data: dict) -> dict:
             facility_name = facility["name"]
             logger.info("Processing facility %s/%s: %s...", index + 1, total, facility_name)
             enriched_facility = copy.deepcopy(facility)
-
+            addr = facility["address"]
+            full_address = (
+                f"{addr['street']} {addr['locality']}, {addr['administrative_area']} {addr['postal_code']}".upper()
+            )
+            if full_address in self.adp_sheet_data:
+                row = self.adp_sheet_data[full_address]
+                logger.debug("Found additional data in the ADP sheet for %s", facility_name)
+                facility = self._update_from_sheet(facility, row)
+            else:
+                logger.debug("Just making sure no other facilities match...")
+                for sheet_row in self.adp_sheet_data.values():
+                    if facility_name.upper() == sheet_row["Name"].upper():
+                        logger.debug("Matching facility for %s", facility_name)
+                        facility = self._update_from_sheet(facility, sheet_row)
+                        break
             # Wikipedia search # todo refactor to method
             try:
                 wiki_result = self._search_wikipedia(facility_name)
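
The sheet rows are keyed by a normalized full address, with a case-insensitive facility-name fallback when the address strings disagree. A minimal sketch of that two-step match, using hypothetical sample data (the real keys come from the ADP sheet and the scraped facility records):

# Hypothetical sample row keyed the way _load_sheet builds its dict.
adp_sheet_data = {
    "100 MAIN ST ANYTOWN, TX 75001": {"Name": "Anytown Detention Center", "FY25 ALOS": 42.0},
}

def find_sheet_row(facility: dict) -> dict | None:
    addr = facility["address"]
    full_address = (
        f"{addr['street']} {addr['locality']}, "
        f"{addr['administrative_area']} {addr['postal_code']}".upper()
    )
    # Step 1: exact (normalized) address match.
    if full_address in adp_sheet_data:
        return adp_sheet_data[full_address]
    # Step 2: fall back to a name match when address formats differ.
    for row in adp_sheet_data.values():
        if facility["name"].upper() == row["Name"].upper():
            return row
    return None

facility = {
    "name": "ANYTOWN DETENTION CENTER",
    "address": {
        "street": "101 Main St",  # differs from the sheet, forcing the name fallback
        "locality": "Anytown",
        "administrative_area": "TX",
        "postal_code": "75001",
    },
}
print(find_sheet_row(facility))  # matched via name despite the address mismatch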

file_utils.py

Lines changed: 4 additions & 2 deletions
@@ -1,12 +1,15 @@
 import copy
 import csv
 import json
+import os
 from schemas import enrichment_print_schema
 from utils import (
     _flatdict,
     logger,
 )

+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+

 def export_to_file(
     facilities_data: dict,
@@ -17,7 +20,7 @@ def export_to_file(
         logger.warning("No data to export!")
         return ""

-    full_name = f"{filename}.{file_type}"
+    full_name = f"{SCRIPT_DIR}{os.sep}{filename}.{file_type}"
     csv_filtered_keys = ["raw_scrape", "wikipedia_search_query", "wikidata_search_query", "osm_search_query"]
     try:
         with open(full_name, "w", newline="", encoding="utf-8") as f_out:
@@ -56,7 +59,6 @@ def print_summary(facilities_data: dict) -> None:
     total_facilities = len(facilities_data["facilities"])
     logger.info("\n=== ICE Detention Facilities Scraper Summary ===")
     logger.info("Scraped data at %s", facilities_data["scraped_date"])
-    logger.info("ice.gov data updated at %s", facilities_data["page_updated_date"])
     logger.info("Total facilities: %s", total_facilities)

     # Count by field office
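
Anchoring exports to SCRIPT_DIR means output files land next to the code regardless of the caller's working directory. A short sketch of the same pattern, run as a script (filename and file_type here are illustrative values, not the real CLI arguments):

import os

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# Illustrative values; the real ones arrive via export_to_file's arguments.
filename, file_type = "facilities", "csv"
full_name = f"{SCRIPT_DIR}{os.sep}{filename}.{file_type}"
print(full_name)  # e.g. /path/to/repo/facilities.csv, independent of the cwd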

main.py

Lines changed: 1 addition & 4 deletions
@@ -26,7 +26,6 @@
 from enricher import ExternalDataEnricher
 from scraper import ICEGovFacilityScraper
 from utils import logger
-
 # CLI, argument parsing, script orchestration


@@ -82,9 +81,7 @@ def main() -> None:
         facilities_data = scraper.scrape_facilities()
     elif args.load_existing:
         facilities_data = copy.deepcopy(default_data.facilities_data)
-        logger.info(
-            "Loaded %s existing facilities from local data. (Not scraping ICE.gov)", len(facilities_data["facilities"])
-        )
+        logger.info("Loaded %s existing facilities from local data. (Not scraping)", len(facilities_data["facilities"]))

     if args.enrich:
         if not facilities_data:

pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -6,7 +6,10 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "beautifulsoup4>=4.13.5",
+    "fastexcel>=0.15.1",
     "lxml>=6.0.1",
+    "polars>=1.33.0",
+    "pyarrow>=21.0.0",
     "requests>=2.32.5",
 ]

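
The three new dependencies all serve the spreadsheet ingestion: polars provides read_excel, which delegates .xlsx parsing to fastexcel (the calamine-based default engine in recent polars releases), and pyarrow presumably covers Arrow interop. A quick sanity check of the toolchain, assuming the workbook already sits next to the script (the enricher downloads it on first run):

import polars

# Same options the commit uses in enricher.py; read_options is
# passed through to the fastexcel engine.
df = polars.read_excel(
    "detentionstats.xlsx",
    sheet_name="Facilities FY25",
    has_header=False,
    read_options={"skip_rows": 7},
)
print(df.shape)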

schemas.py

Lines changed: 47 additions & 3 deletions
@@ -2,33 +2,77 @@

 facilities_schema = {
     "scraped_date": datetime.datetime.utcnow(),
-    "page_updated_date": datetime.datetime.utcnow(),
     "scrape_runtime": 0,
     "enrich_runtime": 0,
     "facilities": [],
 }

+# default keys to "false"-y values so we can merge easier
 facility_schema = {
     "address": {
-        "street": "",
         "administrative_area": "",
         "country": "",
         "locality": "",
         "postal_code": "",
+        "street": "",
     },
     "facility_url": "",
     "field_office": "",
     "image_url": "",
     "name": "",
     "phone": "",
     "raw_scrape": "",
-    "source_url": "",
+    "source_urls": [],
     "wikipedia_page_url": "",
     "wikidata_page_url": "",
     "osm_result_url": "",
     "wikipedia_search_query": "",
     "wikidata_search_query": "",
     "osm_search_query": "",
+    "page_updated_date": None,
+    "population": {
+        "male": {
+            "allowed": False,
+            "criminal": 0,
+            "non_criminal": 0,
+        },
+        "female": {
+            "allowed": False,
+            "criminal": 0,
+            "non_criminal": 0,
+        },
+    },
+    "facility_type": "",
+    "inspection_date": None,
+    "avg_stay_length": 0,
+}
+
+# extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx 2025-09-07
+ice_facility_types = {
+    "BOP": {
+        "expanded_name": "Federal Bureau of Prisons",
+        "description": "A facility operated by the Federal Bureau of Prisons",
+    },
+    "DIGSA": {
+        "expanded_name": "Dedicated Intergovernmental Service Agreement",
+        "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Inter-governmental Service Agreements, which house only ICE detainees – typically these are operated by private contractors pursuant to their agreements with local governments.",
+    },
+    "IGSA": {
+        "expanded_name": "Intergovernmental Service Agreement",
+        "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts for bed space via an Intergovernmental Service Agreement; or local jails used by ICE pursuant to Inter-governmental Service Agreements, which house both ICE and non-ICE detainees, typically county prisoners awaiting trial or serving short sentences, but sometimes also USMS prisoners.",
+    },
+    "SPC": {
+        "expanded_name": "Service Processing Center",
+        "description": "A facility owned by the government and staffed by a combination of federal and contract employees.",
+    },
+    "USMS": {
+        "expanded_name": "United States Marshals Service",
+        "description": "A facility primarily contracted with the USMS for housing of USMS detainees, in which ICE contracts with the USMS for bed space.",
+    },
+    "USMSIGA": {
+        "expanded_name": "USMS Intergovernmental Agreement",
+        "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
+    },
 }

 # enrichment response object
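
The "false"-y defaults comment pays off when merging: every facility starts with a complete key set, so enrichment code can assign nested keys without existence checks, and consumers can still tell "no data" (the default) from a real value. A small illustration, where facility_schema is a trimmed stand-in for the full schema above:

import copy

# Trimmed stand-in for the real facility_schema in schemas.py.
facility_schema = {
    "name": "",
    "population": {
        "male": {"allowed": False, "criminal": 0, "non_criminal": 0},
        "female": {"allowed": False, "criminal": 0, "non_criminal": 0},
    },
    "avg_stay_length": 0,
}

facility = copy.deepcopy(facility_schema)
# No key checks needed: the nested structure is guaranteed to exist.
facility["population"]["male"]["criminal"] = 120
facility["avg_stay_length"] = 37.5
print(facility["population"]["female"]["criminal"])  # still the 0 default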

scraper.py

Lines changed: 9 additions & 7 deletions
@@ -10,25 +10,25 @@
 )
 import time
 from utils import (
+    default_timestamp,
     logger,
     session,
+    timestamp_format,
 )


 class ICEGovFacilityScraper(object):
-    # All methods for scraping ICE websites
-
+    # All methods for scraping ice.gov websites
     def __init__(self):
         self.base_url = "https://www.ice.gov/detention-facilities"
         self.facilities_data = copy.deepcopy(facilities_schema)

     def scrape_facilities(self):
         """Scrape all ICE detention facility data from all 6 pages"""
         start_time = time.time()
-        logger.info("Starting to scrape ICE detention facilities...")
+        logger.info("Starting to scrape ICE.gov detention facilities...")

         self.facilities_data["scraped_date"] = datetime.datetime.utcnow()
-        self.facilities_data["page_updated_date"] = self._scrape_updated(self.base_url)
         # URLs for all pages
         urls = [f"{self.base_url}?exposed_form_display=1&page={i}" for i in range(6)]

@@ -50,8 +50,9 @@ def scrape_facilities(self):

     def _scrape_updated(self, url: str):
         """Scrape first page to get "last updated" time"""
-        default_timestamp = "1970-01-01T00:00:00-+0000"
-        timestamp_format = "%Y-%m-%dT%H:%M:%S-%z"
+        if not url:
+            logger.error("Could not find a time block! Guessing wildly!")
+            return datetime.datetime.strptime(default_timestamp, timestamp_format)
         logger.debug(" Fetching: %s", url)
         try:
             response = session.get(url, timeout=30)
@@ -181,7 +182,7 @@ def _extract_single_facility(self, element, page_url):
         facility = copy.deepcopy(facility_schema)
         raw_scrape = str(element)
         facility["raw_scrape"] = base64.b64encode(raw_scrape.encode("utf-8")).decode("utf-8")
-        facility["source_url"] = page_url
+        facility["source_urls"].append(page_url)
         logger.debug("Trying to get facility data from %s", element)
         # Method 1: Try structured extraction if element has proper HTML structure
         name = element.select_one(".views-field-title")
@@ -221,6 +222,7 @@ def _extract_single_facility(self, element, page_url):
         facility_url_element = element.findAll("a")
         if facility_url_element:
             facility["facility_url"] = f"https://www.ice.gov{facility_url_element[0]['href']}"
+            facility["page_updated_date"] = self._scrape_updated(facility.get("facility_url", ""))
         # Clean up extracted data
         facility = self._clean_facility_data(facility)


utils.py

Lines changed: 3 additions & 0 deletions
@@ -18,6 +18,9 @@
 session.mount("http://", _adapter)
 session.headers.update({"User-Agent": "ICE-Facilities-Research/1.0 (Educational Research Purpose)"})

+default_timestamp = "1970-01-01T00:00:00-+0000"
+timestamp_format = "%Y-%m-%dT%H:%M:%S-%z"
+

 def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
     """flatten a nested dictionary for nicer printing in CSV"""
