
Commit 6b7da7b

start adding participating agency data

Signed-off-by: John Seekins <[email protected]>

1 parent 0795548 commit 6b7da7b

7 files changed: +124 / -10 lines


ice_scrapers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -131,7 +131,9 @@
 }
 field_office_to_aor = {v: k for k, v in area_of_responsibility.items()}

+from .agencies import scrape_agencies  # noqa: F401,E402
 from .utils import (  # noqa: E402
+    download_file,  # noqa: F401
     get_ice_scrape_pages,  # noqa: F401
     repair_locality,  # noqa: F401
     repair_street,  # noqa: F401

ice_scrapers/agencies.py

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
+# ICEFieldOfficeScraper class and scraping-related code
+from bs4 import BeautifulSoup
+import copy
+from .utils import download_file
+import os
+import polars
+import re
+from schemas import (
+    agencies_287g,
+    active_agency,
+    pending_agency,
+)
+import time
+from utils import (
+    logger,
+    session,
+)
+
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+base_xlsx_url = "https://www.ice.gov/identify-and-arrest/287g"
+
+
+def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
+    """Collect data on participating agencies"""
+    start_time = time.time()
+    resp = session.get(base_xlsx_url, timeout=120)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.content, "html.parser")
+    links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
+    if not links:
+        raise Exception(f"Could not find any XLSX files on {base_xlsx_url}")
+    logger.debug(links)
+    date_re = re.compile(r"\d{8}pm")
+    agencies = copy.deepcopy(agencies_287g)
+    for link in links:
+        data: dict = {}
+        match link:
+            case x if "participating" in x:
+                schema = copy.deepcopy(active_agency)
+            case x if "pending" in x:
+                schema = copy.deepcopy(pending_agency)
+            case _:
+                raise ValueError(f"Found an unsupported agency datasheet: {link}")
+        """
+        Yes, polars supports loading from a URL. But this pattern
+        lets us cache the download
+        """
+        # remove the date so we can easily overwrite the local (cached) file
+        filename = date_re.sub("", link.split('/')[-1])
+        path = f"{SCRIPT_DIR}{os.sep}{filename}"
+        if force_download or not os.path.exists(path):
+            logger.info("Downloading agency info sheet from %s", link)
+            download_file(link, path)
+        df = polars.read_excel(
+            drop_empty_rows=True,
+            raise_if_empty=True,
+            source=open(path, "rb")
+        )
+        for row in df.iter_rows(named=True):
+            data = copy.deepcopy(schema)
+            data["state"] = row["STATE"]
+            data["agency"] = row["LAW ENFORCEMENT AGENCY"]
+            data["county"] = row["COUNTY"]
+            data["type"] = row["TYPE"]
+            data["support_type"] = row["SUPPORT TYPE"]
+            if "participating" in filename:
+                data["moa"] = row["MOA"]
+                data["signed"] = row["SIGNED"]
+                data["addendum"] = row["ADDENDUM"]
+                agencies["active"].append(data)
+            else:
+                agencies["pending"].append(data)
+        if not keep_sheet:
+            os.unlink(path)
+    logger.info(" Collected %s active and %s pending agencies", len(agencies["active"]), len(agencies["pending"]))
+    agencies["scrape_runtime"] = time.time() - start_time
+    return agencies
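
For orientation, a minimal usage sketch of the new scraper (assuming it is run from the repo root so the top-level utils and schemas imports resolve; passing force_download=False reuses a previously cached sheet):

from ice_scrapers import scrape_agencies

# Reuse cached XLSX files if they are already on disk (hypothetical local run).
agencies = scrape_agencies(keep_sheet=True, force_download=False)
print(len(agencies["active"]), "active agencies")
print(len(agencies["pending"]), "pending agencies")
print("runtime:", agencies["scrape_runtime"], "seconds")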

ice_scrapers/general.py

Lines changed: 4 additions & 2 deletions

@@ -4,13 +4,15 @@
     insert_additional_facilities,
     load_sheet,
     merge_field_offices,
+    scrape_agencies,
     scrape_facilities,
     scrape_field_offices,
 )
 from schemas import facilities_schema


-def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False) -> dict:
+def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False) -> tuple[dict, dict]:
+    agencies = scrape_agencies(keep_sheet, force_download)
     facilities_data = copy.deepcopy(facilities_schema)
     facilities = load_sheet(keep_sheet, force_download)
     facilities_data["facilities"] = copy.deepcopy(facilities)
@@ -21,4 +23,4 @@ def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = Tr
     facilities_data = merge_field_offices(facilities_data, field_offices)
     facilities_data = insert_additional_facilities(facilities_data)

-    return facilities_data
+    return facilities_data, agencies
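
Because the wrapper now returns a tuple, callers have to unpack both values (main.py below is updated accordingly); a quick sketch, assuming the facilities_schema and agencies_287g keys shown elsewhere in this commit:

facilities_data, agencies = facilities_scrape_wrapper(
    keep_sheet=True,
    force_download=True,
    skip_vera=False,
)
# facilities_data keeps its previous shape; the agency data travels alongside it.
print(len(facilities_data["facilities"]), "facilities,", len(agencies["active"]), "active agencies")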

ice_scrapers/spreadsheet_load.py

Lines changed: 2 additions & 7 deletions

@@ -2,6 +2,7 @@
 import copy
 import datetime
 from ice_scrapers import (
+    download_file,
     ice_facility_types,
     ice_inspection_types,
     repair_locality,
@@ -84,13 +85,7 @@ def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> Tup
     logger.debug("Found sheet at: %s", actual_link)
     if force_download or not os.path.exists(filename):
         logger.info("Downloading detention stats sheet from %s", actual_link)
-        resp = session.get(actual_link, timeout=120, stream=True)
-        size = len(resp.content)
-        with open(filename, "wb") as f:
-            for chunk in resp.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-        logger.debug("Wrote %s byte sheet to %s", size, filename)
+        download_file(actual_link, filename)
     df = polars.read_excel(
         drop_empty_rows=True,
         has_header=False,

ice_scrapers/utils.py

Lines changed: 13 additions & 0 deletions

@@ -7,6 +7,19 @@
 )


+def download_file(link: str, path: str) -> None:
+    """
+    Standard pattern for downloading a binary file from a URL
+    """
+    resp = session.get(link, timeout=120, stream=True)
+    size = len(resp.content)
+    with open(path, "wb") as f:
+        for chunk in resp.iter_content(chunk_size=1024):
+            if chunk:
+                f.write(chunk)
+    logger.debug("Wrote %s byte sheet to %s", size, path)
+
+
 def special_facilities(facility: dict) -> dict:
     """
     Some very specific facilities have unique fixes
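
A short usage sketch of the shared helper, which centralizes the chunked-download pattern previously inlined in spreadsheet_load.py so both the detention-stats and 287(g) scrapers cache files the same way (the URL and destination path here are purely illustrative):

from ice_scrapers import download_file  # re-exported via ice_scrapers/__init__.py

# Hypothetical file, purely for illustration.
download_file("https://www.ice.gov/doclib/example.xlsx", "/tmp/example.xlsx")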

main.py

Lines changed: 1 addition & 1 deletion

@@ -128,7 +128,7 @@ def main() -> None:
         exit(1)

     if args.scrape:
-        facilities_data = facilities_scrape_wrapper(
+        facilities_data, agencies = facilities_scrape_wrapper(
             keep_sheet=not args.delete_sheets,
             force_download=not args.skip_downloads,
             skip_vera=args.skip_vera,

schemas.py

Lines changed: 25 additions & 0 deletions

@@ -105,6 +105,31 @@
     },
 }

+agencies_287g: dict = {
+    "active": [{}],
+    "pending": [{}],
+    "scrape_runtime": 0,
+    "scraped_date": datetime.datetime.now(datetime.UTC),
+}
+
+active_agency: dict = {
+    "state": "",
+    "agency": "",
+    "county": "",
+    "type": "",
+    "signed": None,
+    "moa": "",
+    "addendum": "",
+    "support_type": "",
+}
+
+pending_agency: dict = {
+    "state": "",
+    "agency": "",
+    "county": "",
+    "type": "",
+    "support_type": "",
+}

 # enrichment response object
 enrich_resp_schema: dict = {
