From dae7be8032b581ded0b211cb6e15b21e25588042 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Tue, 23 Sep 2025 17:33:14 -0600 Subject: [PATCH 01/26] add additional facility types and groupings from Vera Signed-off-by: John Seekins --- ice_scrapers/__init__.py | 41 ++++++++++++++++++++++++++++---- ice_scrapers/spreadsheet_load.py | 5 ++++ schemas.py | 1 + 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py index 742d6fa..097c36c 100644 --- a/ice_scrapers/__init__.py +++ b/ice_scrapers/__init__.py @@ -43,20 +43,57 @@ "ORSA": "Operational Review Self-Assessment", } +# extracted from https://vera-institute.files.svdcdn.com/production/downloads/dashboard_appendix.pdf 2025-09-23 +ice_facility_group_mapping = { + "Non-Dedicated": ["IGSA"], + "Dedicated": ["DIGSA", "CDF", "SPC"], + "Federal": ["BOF", "USMSIGA", "USMS IGA", "USMS CDF", "DOD", "MOC"], + "Hold/Staging": ["Hold", "Staging"], + "Family/Youth": ["Family", "Juvenile"], + "Medical": ["Hospital"], + "Hotel": ["Hotel"], + "Other/Unknown": ["Other", "Unknown"], +} + # extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx 2025-09-07 +# and https://vera-institute.files.svdcdn.com/production/downloads/dashboard_appendix.pdf 2025-09-23 ice_facility_types = { "BOP": { "expanded_name": "Federal Bureau of Prisons", "description": "A facility operated by the Federal Bureau of Prisons", }, + "CDF": { + "expanded_name": "Contract Detention Facility", + "description": "Name derived from listing at https://www.vera.org/ice-detention-trends", + }, "DIGSA": { "expanded_name": "Dedicated Intergovernmental Service Agreement", "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees – 
typically these are operated by private contractors pursuant to their agreements with local governments.", }, + "Family": { + "expanded_name": "Family", + "description": "A facility in which families are able to remain together while awaiting their proceedings", + }, + "Hospital": { + "expanded_name": "Hospital", + "description": "A medical facility", + }, "IGSA": { "expanded_name": "Intergovernmental Service Agreement", "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts for bed space via an Intergovernmental Service Agreement; or local jails used by ICE pursuant to Intergovernmental Service Agreements, which house both ICE and non-ICE detainees, typically county prisoners awaiting trial or serving short sentences, but sometimes also USMS prisoners.", }, + "Juvenile": { + "expanded_name": "Juvenile", + "description": "An IGSA facility capable of housing juveniles (separate from adults) for a temporary period of time", + }, + "Other": { + "expanded_name": "Other", + "description": "Facilities including but not limited to transportation-related facilities, hotels, and/or other facilities", + }, + "Unknown": { + "expanded_name": "Unknown", + "description": "A facility who's type could not be identified", + }, "SPC": { "expanded_name": "Service Processing Center", "description": "A facility owned by the government and staffed by a combination of federal and contract employees.", @@ -78,10 +115,6 @@ "expanded_name": "United States Marshals Service Contract Detention Facility", "description": "Name derived from listing at https://www.vera.org/ice-detention-trends", }, - "CDF": { - "expanded_name": "Contract Detention Facility", - "description": "Name derived from listing at https://www.vera.org/ice-detention-trends", - }, } # ICE AOR mappings diff --git a/ice_scrapers/spreadsheet_load.py b/ice_scrapers/spreadsheet_load.py index 703a1a1..893dbd6 100644 --- a/ice_scrapers/spreadsheet_load.py +++ 
b/ice_scrapers/spreadsheet_load.py @@ -15,6 +15,7 @@ ice_inspection_types, repair_zip, repair_locality, + ice_facility_group_mapping, ) from typing import Tuple from utils import ( @@ -143,6 +144,10 @@ def load_sheet(keep_sheet: bool = True) -> dict: if ft_details: details["facility_type"]["description"] = ft_details["description"] details["facility_type"]["expanded_name"] = ft_details["expanded_name"] + for group, ids in ice_facility_group_mapping.items(): + if row["Type Detailed"] in ids: + details["facility_type"]["group"] = group + break details["avg_stay_length"] = row["FY25 ALOS"] details["inspection"] = { # fall back to type code diff --git a/schemas.py b/schemas.py index c8394fe..4e2f5ab 100644 --- a/schemas.py +++ b/schemas.py @@ -95,6 +95,7 @@ "mandatory": 0, "guaranteed_min": 0, }, + "group": "Other/Unknown", }, "inspection": { "last_type": "", From e42f685a9ef930836cf817d8aa79c17d8838205e Mon Sep 17 00:00:00 2001 From: John Seekins Date: Tue, 23 Sep 2025 17:47:05 -0600 Subject: [PATCH 02/26] start playing with vera.org facility data Signed-off-by: John Seekins --- ice_scrapers/__init__.py | 1 + ice_scrapers/vera_data.py | 32 ++++++++++++++++++++++++++++++++ main.py | 2 ++ 3 files changed, 35 insertions(+) create mode 100644 ice_scrapers/vera_data.py diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py index 097c36c..1430dde 100644 --- a/ice_scrapers/__init__.py +++ b/ice_scrapers/__init__.py @@ -160,3 +160,4 @@ merge_field_offices, # noqa: F401 scrape_field_offices, # noqa: F401 ) +from .vera_data import collect_vera_facility_data # noqa: F401,E402 diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py new file mode 100644 index 0000000..80c1312 --- /dev/null +++ b/ice_scrapers/vera_data.py @@ -0,0 +1,32 @@ +import os +import polars +from utils import ( + logger, + session, +) + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +# Github can aggressively rate-limit requests, so this may fail in surprising ways! 
+base_url = ( + "https://raw.githubusercontent.com/vera-institute/ice-detention-trends/refs/heads/main/metadata/facilities.csv" +) +filename = f"{SCRIPT_DIR}{os.sep}vera_facilities.csv" + + +def collect_vera_facility_data(facilities_data: dict, keep_csv: bool = True) -> dict: + res = session.get(base_url, timeout=120, stream=True) + res.raise_for_status() + size = len(res.content) + with open(filename, "wb") as f: + for chunk in res.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + logger.debug("Wrote %s byte sheet to %s", size, filename) + df = polars.read_csv( + has_header=True, + raise_if_empty=True, + source=filename, + use_pyarrow=True, + ) + logger.info(df) + exit(1) diff --git a/main.py b/main.py index 76e6d8e..aea722f 100644 --- a/main.py +++ b/main.py @@ -24,6 +24,7 @@ from file_utils import export_to_file, print_summary import default_data from ice_scrapers import ( + collect_vera_facility_data, load_sheet, merge_field_offices, scrape_facilities, @@ -123,6 +124,7 @@ def main() -> None: exit(1) if args.scrape: + facilities_data = collect_vera_facility_data(facilities_data) facilities = load_sheet() facilities_data["facilities"] = copy.deepcopy(facilities) facilities_data = scrape_facilities(facilities_data) From 6e0648ec3c056e565f4dafb7605a9d7b5abcc25e Mon Sep 17 00:00:00 2001 From: John Seekins Date: Tue, 23 Sep 2025 17:49:42 -0600 Subject: [PATCH 03/26] fail load on empty sheet Signed-off-by: John Seekins --- ice_scrapers/spreadsheet_load.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ice_scrapers/spreadsheet_load.py b/ice_scrapers/spreadsheet_load.py index 893dbd6..5819d5e 100644 --- a/ice_scrapers/spreadsheet_load.py +++ b/ice_scrapers/spreadsheet_load.py @@ -63,6 +63,7 @@ def _download_sheet(keep_sheet: bool = True) -> Tuple[polars.DataFrame, str]: df = polars.read_excel( drop_empty_rows=True, has_header=False, + raise_if_empty=True, # because we're manually defining the header... 
read_options={"skip_rows": 7, "column_names": facility_sheet_header}, sheet_name=f"Facilities FY{cur_year}", From ce87a69b86afc9b1ed84fb66b99304624c16314d Mon Sep 17 00:00:00 2001 From: John Seekins Date: Tue, 23 Sep 2025 18:18:35 -0600 Subject: [PATCH 04/26] rough pass at adding vera data Signed-off-by: John Seekins --- ice_scrapers/spreadsheet_load.py | 24 ++++++------ ice_scrapers/vera_data.py | 67 ++++++++++++++++++++++++-------- main.py | 16 +++++++- 3 files changed, 76 insertions(+), 31 deletions(-) diff --git a/ice_scrapers/spreadsheet_load.py b/ice_scrapers/spreadsheet_load.py index 5819d5e..d0495be 100644 --- a/ice_scrapers/spreadsheet_load.py +++ b/ice_scrapers/spreadsheet_load.py @@ -28,7 +28,7 @@ filename = f"{SCRIPT_DIR}{os.sep}detentionstats.xlsx" -def _download_sheet(keep_sheet: bool = True) -> Tuple[polars.DataFrame, str]: +def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> Tuple[polars.DataFrame, str]: """Download the detention stats sheet from ice.gov""" resp = session.get(base_xlsx_url, timeout=120) resp.raise_for_status() @@ -50,16 +50,16 @@ def _download_sheet(keep_sheet: bool = True) -> Tuple[polars.DataFrame, str]: actual_link = link["href"] # this seems like tracking into the future... 
cur_year = year - logger.debug("Found sheet at: %s", actual_link) - logger.info("Downloading detention stats sheet from %s", actual_link) - resp = session.get(actual_link, timeout=120, stream=True) - size = len(resp.content) - with open(filename, "wb") as f: - for chunk in resp.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - logger.debug("Wrote %s byte sheet to %s", size, filename) + if force_download or not os.path.exists(filename): + logger.info("Downloading detention stats sheet from %s", actual_link) + resp = session.get(actual_link, timeout=120, stream=True) + size = len(resp.content) + with open(filename, "wb") as f: + for chunk in resp.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + logger.debug("Wrote %s byte sheet to %s", size, filename) df = polars.read_excel( drop_empty_rows=True, has_header=False, @@ -74,8 +74,8 @@ def _download_sheet(keep_sheet: bool = True) -> Tuple[polars.DataFrame, str]: return df, actual_link -def load_sheet(keep_sheet: bool = True) -> dict: - df, sheet_url = _download_sheet(keep_sheet) +def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict: + df, sheet_url = _download_sheet(keep_sheet, force_download) """Convert the detentionstats sheet data into something we can update our facilities with""" results: dict = {} # occassionally a phone number shows up in weird places in the spreadsheet. 
diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index 80c1312..b73dd85 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -1,5 +1,7 @@ +import copy import os import polars +from schemas import facility_schema from utils import ( logger, session, @@ -13,20 +15,51 @@ filename = f"{SCRIPT_DIR}{os.sep}vera_facilities.csv" -def collect_vera_facility_data(facilities_data: dict, keep_csv: bool = True) -> dict: - res = session.get(base_url, timeout=120, stream=True) - res.raise_for_status() - size = len(res.content) - with open(filename, "wb") as f: - for chunk in res.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - logger.debug("Wrote %s byte sheet to %s", size, filename) - df = polars.read_csv( - has_header=True, - raise_if_empty=True, - source=filename, - use_pyarrow=True, - ) - logger.info(df) - exit(1) +def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, force_download: bool = True) -> dict: + if force_download or not os.path.exists(filename): + res = session.get(base_url, timeout=120, stream=True) + res.raise_for_status() + size = len(res.content) + with open(filename, "wb") as f: + for chunk in res.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + logger.debug("Wrote %s byte sheet to %s", size, filename) + df = polars.read_csv(has_header=True, raise_if_empty=True, source=filename, use_pyarrow=True) + logger.debug("Extracted data: %s", df) + """ + We retrieve the following columns + detention_facility_code, detention_facility_name, latitude, longitude, city, state, type_detailed, type_grouped + + None of the data Vera provides on a facility is more accurate than data we already have, so the logic + here should be _purely_ "if not exists, add". 
+ """ + skipped_count = 0 + for row in df.iter_rows(named=True): + found = False + facility_name = row["detention_facility_name"] + for k, v in facilities_data["facilities"].items(): + if ( + v["name"].upper() == facility_name.upper() + and v["address"]["administrative_area"].upper() == row["state"].upper() + and v["address"]["locality"].upper() == row["city"].upper() + ): + logger.debug(" Found matching facility %s...skipping", v["name"]) + skipped_count += 1 + found = True + break + if not found: + facilities_data["facilities"][facility_name] = copy.deepcopy(facility_schema) + facilities_data["facilities"][facility_name]["source_urls"].append(base_url) + facilities_data["facilities"][facility_name]["name"] = facility_name + facilities_data["facilities"][facility_name]["address"]["administrative_area"] = row["state"] + facilities_data["facilities"][facility_name]["address"]["locality"] = row["city"] + facilities_data["facilities"][facility_name]["address"]["country"] = "United States" + facilities_data["facilities"][facility_name]["osm"]["latitude"] = row["latitude"] + facilities_data["facilities"][facility_name]["osm"]["longitude"] = row["longitude"] + facilities_data["facilities"][facility_name]["facility_type"]["id"] = row["type_detailed"] + facilities_data["facilities"][facility_name]["facility_type"]["group"] = row["type_grouped"] + logger.debug(" Skipped %s facilities", skipped_count) + if not keep_sheet: + os.unlink(filename) + return facilities_data diff --git a/main.py b/main.py index aea722f..1699586 100644 --- a/main.py +++ b/main.py @@ -101,6 +101,16 @@ def main() -> None: action="store_true", help="Add another column on export for OpenStreetMap debugging details and redirects", ) + parser.add_argument( + "--skip-downloads", + action="store_true", + help="Skip downloading sheet data", + ) + parser.add_argument( + "--delete-sheets", + action="store_true", + help="Remove any sheets we downloaded", + ) args = parser.parse_args() if args.debug: @@ -124,9 
+134,11 @@ def main() -> None: exit(1) if args.scrape: - facilities_data = collect_vera_facility_data(facilities_data) - facilities = load_sheet() + facilities = load_sheet(keep_sheet=not args.delete_sheets, force_download=not args.skip_downloads) facilities_data["facilities"] = copy.deepcopy(facilities) + facilities_data = collect_vera_facility_data( + facilities_data, keep_sheet=not args.delete_sheets, force_download=not args.skip_downloads + ) facilities_data = scrape_facilities(facilities_data) field_offices = scrape_field_offices() facilities_data = merge_field_offices(facilities_data, field_offices) From f7c00b42c6c711010f891f182f388b49dcfdd941 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 14:57:31 -0600 Subject: [PATCH 05/26] more Vera matching and store vera facility ID Signed-off-by: John Seekins --- default_data.py | 22 +++++++++++++++++++- ice_scrapers/general.py | 2 +- ice_scrapers/vera_data.py | 44 +++++++++++++++++++++++++++++++++++++++ schemas.py | 1 + 4 files changed, 67 insertions(+), 2 deletions(-) diff --git a/default_data.py b/default_data.py index ad1a35c..b0381d0 100644 --- a/default_data.py +++ b/default_data.py @@ -70,6 +70,7 @@ "https://www.ice.gov/detention-facilities?page=0&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/baker-county-facility", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -131,6 +132,7 @@ "total": 1.916666666666666, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -200,6 +202,7 @@ "https://www.ice.gov/detention-facilities?page=4&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/northwest-ice-processing-center-nwipc", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": 
{"page_url": "", "search_query": ""}, }, @@ -261,6 +264,7 @@ "total": 2.4642857142857095, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -326,6 +330,7 @@ "total": 5.038690476190489, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -395,6 +400,7 @@ "https://www.ice.gov/detention-facilities?page=3&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/grayson-county-detention-center", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -460,6 +466,7 @@ "total": 16.732142857143007, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -525,6 +532,7 @@ "total": 20.55952380952385, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -594,6 +602,7 @@ "https://www.ice.gov/detention-facilities?page=5&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/san-luis-regional-detention-center", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -608,7 +617,7 @@ }, "address_str": "409 FM 1144,KARNES CITY,TX,78118", "facility_type": { - "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or 
facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees \u2013 typically these are operated by private contractors pursuant to their agreements with local governments.", + "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees. Typically these are operated by private contractors pursuant to their agreements with local governments.", "expanded_name": "Dedicated Intergovernmental Service Agreement", "id": "DIGSA", }, @@ -663,6 +672,7 @@ "https://www.ice.gov/detention-facilities?page=3&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/karnes-county-ipc", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -728,6 +738,7 @@ "https://www.ice.gov/detention-facilities?page=1&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/delaney-hall-detention-facility", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -793,6 +804,7 @@ "total": 28.62202380952395, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -862,6 +874,7 @@ "https://www.ice.gov/detention-facilities?page=4&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/moshannon-valley-processing-center", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -919,6 +932,7 @@ "total": 13.041666666666726, }, "source_urls": 
["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -984,6 +998,7 @@ "https://www.ice.gov/detention-facilities?page=0&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/butler-county-sheriffs-office", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1053,6 +1068,7 @@ "https://www.ice.gov/detention-facilities?page=4&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/phelps-county-jail", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1122,6 +1138,7 @@ "https://www.ice.gov/detention-facilities?page=1&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/laredo-detention-center", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1187,6 +1204,7 @@ "https://www.ice.gov/detention-facilities?page=1&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/fort-bliss-detention-facility", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1235,6 +1253,7 @@ "https://www.ice.gov/detention-facilities?page=4&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/naval-station-guantanamo-bay", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1296,6 +1315,7 @@ "total": 1.5208333333333308, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, diff --git a/ice_scrapers/general.py 
b/ice_scrapers/general.py index d1cd78f..dba34ce 100644 --- a/ice_scrapers/general.py +++ b/ice_scrapers/general.py @@ -14,7 +14,7 @@ def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = Tr facilities_data = copy.deepcopy(facilities_schema) facilities = load_sheet(keep_sheet, force_download) facilities_data["facilities"] = copy.deepcopy(facilities) - facilities_data = collect_vera_facility_data(facilities_data, keep_sheet=keep_sheet, force_download=force_download) + facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download) facilities_data = scrape_facilities(facilities_data) field_offices = scrape_field_offices() facilities_data = merge_field_offices(facilities_data, field_offices) diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index 14e764d..0ec3d36 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -22,6 +22,47 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: {"match": "Adams County", "replace": "Adams County Courthouse", "city": "Ritzville"}, {"match": "Lemon Creek, Juneau,AK", "replace": "Lemon Creek Correctional Facility", "city": "Juneau"}, {"match": "Dept Of Corrections-Hagatna", "replace": "Department of Corrections Hagatna", "city": "Hagatna"}, + {"match": "Essex Co. Jail, Middleton", "replace": "Essex County Jail", "city": "Middleton"}, + {"match": "Etowah County Jail (AL)", "replace": "Etowah County Jail", "city": "Gadsden"}, + {"match": "Fairfax Co Jail", "replace": "Fairfax County Jail", "city": "Fairfax"}, + { + "match": "Ft Lauderdale Behavor Hlth Ctr", + "replace": "Fort Lauderdale Behavioral Health Center", + "city": "Oakland Park", + }, + {"match": "Marion Correctional Inst.", "replace": "Marion Correctional Institution", "city": "Ocala"}, + {"match": "Florida St. 
Pris.", "replace": "Florida State Prison", "city": "Raiford"}, + {"match": "Dade Correctional Inst", "replace": "Dade Correctional Institution", "city": "Florida City"}, + {"match": "Franklin County Jail, VT", "replace": "Franklin County Jail", "city": "Saint Albans"}, + {"match": "Frederick County Det. Cen", "replace": "Frederick County Detention Center", "city": "Frederick"}, + {"match": "Freeborn County Jail, MN", "replace": "Freeborn County Jail", "city": "Albert Lea"}, + {"match": "Fremont County Jail, CO", "replace": "Fremont County Jail", "city": "Canon City"}, + {"match": "Fremont County Jail, WY", "replace": "Fremont County Jail", "city": "Lander"}, + {"match": "Grand Forks County Correc", "replace": "Grand Forks County Corrections", "city": "Grand Forks"}, + {"match": "Grand Forks Co. Juvenile", "replace": "Grand Forks County Juvenile", "city": "Grand Forks"}, + {"match": "Haile Det. Center", "replace": "Haile Detention Center", "city": "Caldwell"}, + {"match": "Hampden Co.House Of Corr.", "replace": "Hampden County House of Corrections", "city": "Ludlow"}, + { + "match": "Henderson County Det. Fac.", + "replace": "Henderson County Detention Facility", + "city": "Hendersonville", + }, + {"match": "Hel District Custody", "replace": "Helena District Custody", "city": "Helena"}, + {"match": "Houston Contract Det.Fac.", "replace": "Houston Contract Detention Facility", "city": "Houston"}, + {"match": "Howard County Det Cntr", "replace": "Howard County Detention Center", "city": "Jessup"}, + {"match": "In Dept. 
Of Corrections", "replace": "Indiana Department of Corrections", "city": "Indianapolis"}, + {"match": "Beth Israel Hospital, Manhattan", "replace": "Beth Israel Hospital Manhattan", "city": "New York"}, + {"match": "Kent Co.,Grand Rapids,MI", "replace": "Kent County Jail", "city": "Grand Rapids"}, + {"match": "Kern County Jail (Lerdo)", "replace": "Kern County Jail", "city": "Bakersfield"}, + {"match": "Lackawana Cnty Jail, PA", "replace": "Lackawana County Jail", "city": "Scranton"}, + {"match": "Las Colinas Women Det Fac", "replace": "Las Colinas Women's Detention Facility", "city": "Santee"}, + {"match": "Lawrence Co. Jail, SD", "replace": "Lawrence County Jail", "city": "Deadwood"}, + {"match": "Lehigh County Jail, PA", "replace": "Lehigh County Jail", "city": "Allentown"}, + {"match": "Macomb Co.Mt.Clemens,MI.", "replace": "Macomb County Jail", "city": "Mount Clemens"}, + {"match": "Bwater St Hosp Bridgewate", "replace": "Bridgewater State Hospital", "city": "Bridgewater"}, + {"match": "Meade Co. Jail, SD", "replace": "Meade County Jail", "city": "Sturgis"}, + {"match": "Mecklenburg (NC) Co Jail", "replace": "Mecklenburg County Jail", "city": "Charlotte"}, + {"match": "Mountrail Co. 
Jail, ND", "replace": "Mountrail County Jail", "city": "Stanley"}, ] fixed = False for m in matches: @@ -74,6 +115,7 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f logger.debug(" Found matching facility %s...", v["name"]) facilities_data["facilities"][k]["osm"]["latitude"] = row["latitude"] facilities_data["facilities"][k]["osm"]["longitude"] = row["longitude"] + facilities_data["facilities"][k]["vera_id"] = row["detention_facility_code"] matched_count += 1 found = True break @@ -84,10 +126,12 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f facilities_data["facilities"][addr_str]["address"]["administrative_area"] = row["state"] facilities_data["facilities"][addr_str]["address"]["locality"] = row["city"] facilities_data["facilities"][addr_str]["address"]["country"] = "United States" + facilities_data["facilities"][addr_str]["address_str"] = addr_str facilities_data["facilities"][addr_str]["osm"]["latitude"] = row["latitude"] facilities_data["facilities"][addr_str]["osm"]["longitude"] = row["longitude"] facilities_data["facilities"][addr_str]["facility_type"]["id"] = row["type_detailed"] facilities_data["facilities"][addr_str]["facility_type"]["group"] = row["type_grouped"] + facilities_data["facilities"][addr_str]["vera_id"] = row["detention_facility_code"] logger.info( " Found %s facilities: Skipped %s, Matched %s, corrected names on %s", df.height, diff --git a/schemas.py b/schemas.py index e3fb676..84f8ca2 100644 --- a/schemas.py +++ b/schemas.py @@ -94,6 +94,7 @@ "total": 0, }, "source_urls": [], + "vera_id": "", "wikipedia": { "page_url": "", "search_query": "", From 739e6f7ec4f6fdbbe8d9827130178670f7f45ff2 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 15:10:35 -0600 Subject: [PATCH 06/26] fix city occasionally, too Signed-off-by: John Seekins --- ice_scrapers/vera_data.py | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff 
--git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index 0ec3d36..a5b44f5 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -63,6 +63,12 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: {"match": "Meade Co. Jail, SD", "replace": "Meade County Jail", "city": "Sturgis"}, {"match": "Mecklenburg (NC) Co Jail", "replace": "Mecklenburg County Jail", "city": "Charlotte"}, {"match": "Mountrail Co. Jail, ND", "replace": "Mountrail County Jail", "city": "Stanley"}, + { + "match": "Saipan Department Of Corrections", + "replace": "SAIPAN DEPARTMENT OF CORRECTIONS (SUSUPE)", + "city": "Saipan", + }, + # MPSIPAN,Saipan Department Of Corrections,15.156223,145.703679,Saipan,MP,USMS IGA,Federal ] fixed = False for m in matches: @@ -73,6 +79,20 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: return name, fixed +def _vera_city_fixes(city: str, state: str) -> Tuple[str, bool]: + """There are a few cases where getting a state match requires some munging""" + matches = [ + {"match": "Saipan", "replace": "Susupe, Saipan", "city": "MP"}, + ] + fixed = False + for m in matches: + if m["match"] == city and m["city"] == state: + fixed = True + city = m["replace"] + break + return city, fixed + + def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, force_download: bool = True) -> dict: logger.info("Collecting and extracting data from vera.org facility data...") if force_download or not os.path.exists(filename): @@ -95,22 +115,23 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f """ matched_count = 0 skipped_count = 0 - fixed_names = 0 + fixed = 0 for row in df.iter_rows(named=True): if not row["state"] or not row["city"]: logger.warning(" Skipping Vera row with missing values: %s", row) skipped_count += 1 continue found = False - facility_name, fixed = _vera_name_fixes(row["detention_facility_name"], row["city"]) - if fixed: - fixed_names += 1 - addr_str = 
f"{facility_name},{row['city']},{row['state']},United States" + facility_name, fixed_name = _vera_name_fixes(row["detention_facility_name"], row["city"]) + city, fixed_city = _vera_city_fixes(row["city"], row["state"]) + if fixed_name or fixed_city: + fixed += 1 + addr_str = f"{facility_name},{city},{row['state']},United States" for k, v in facilities_data["facilities"].items(): if ( v["name"].upper() == facility_name.upper() and v["address"]["administrative_area"].upper() == row["state"].upper() - and v["address"]["locality"].upper() == row["city"].upper() + and v["address"]["locality"].upper() == city.upper() ): logger.debug(" Found matching facility %s...", v["name"]) facilities_data["facilities"][k]["osm"]["latitude"] = row["latitude"] @@ -124,7 +145,7 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f facilities_data["facilities"][addr_str]["source_urls"].append(base_url) facilities_data["facilities"][addr_str]["name"] = facility_name facilities_data["facilities"][addr_str]["address"]["administrative_area"] = row["state"] - facilities_data["facilities"][addr_str]["address"]["locality"] = row["city"] + facilities_data["facilities"][addr_str]["address"]["locality"] = city facilities_data["facilities"][addr_str]["address"]["country"] = "United States" facilities_data["facilities"][addr_str]["address_str"] = addr_str facilities_data["facilities"][addr_str]["osm"]["latitude"] = row["latitude"] @@ -137,7 +158,7 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f df.height, skipped_count, matched_count, - fixed_names, + fixed, ) if not keep_sheet: os.unlink(filename) From c6fe71f0e0b56ac5631848ac8118a54f929e5107 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 15:14:32 -0600 Subject: [PATCH 07/26] one more match Signed-off-by: John Seekins --- ice_scrapers/vera_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ice_scrapers/vera_data.py 
b/ice_scrapers/vera_data.py index a5b44f5..b087197 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -68,7 +68,7 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: "replace": "SAIPAN DEPARTMENT OF CORRECTIONS (SUSUPE)", "city": "Saipan", }, - # MPSIPAN,Saipan Department Of Corrections,15.156223,145.703679,Saipan,MP,USMS IGA,Federal + {"match": "Sitka City Jail, Sitka AK", "replace": "Sitka City Jail", "city": "Sitka"}, ] fixed = False for m in matches: From b48a3cd556332ebf6e3790db47b8ec374b6954f4 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 17:30:13 -0600 Subject: [PATCH 08/26] fix typo Signed-off-by: John Seekins --- ice_scrapers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py index ee01064..c21e9b2 100644 --- a/ice_scrapers/__init__.py +++ b/ice_scrapers/__init__.py @@ -92,7 +92,7 @@ }, "Unknown": { "expanded_name": "Unknown", - "description": "A facility who's type could not be identified", + "description": "A facility whose type could not be identified", }, "SPC": { "expanded_name": "Service Processing Center", From 8a13953a7232246650d2d2fa2008e0251415e40c Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 18:06:12 -0600 Subject: [PATCH 09/26] increase matching Signed-off-by: John Seekins --- ice_scrapers/__init__.py | 5 +++++ ice_scrapers/facilities_scraper.py | 4 ++++ ice_scrapers/spreadsheet_load.py | 21 +++++++++++++-------- ice_scrapers/utils.py | 15 +++++++++++++++ ice_scrapers/vera_data.py | 22 ++++++++++++++++++++++ 5 files changed, 59 insertions(+), 8 deletions(-) diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py index c21e9b2..4a2ea16 100644 --- a/ice_scrapers/__init__.py +++ b/ice_scrapers/__init__.py @@ -70,6 +70,10 @@ "expanded_name": "Dedicated Intergovernmental Service Agreement", "description": "A publicly-owned facility operated by state/local government(s), or private 
contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees – typically these are operated by private contractors pursuant to their agreements with local governments.", }, + "DOD": { + "expanded_name": "Department of Defence", + "description": "Department of Defence facilities - Often Army bases", + }, "Family": { "expanded_name": "Family", "description": "A facility in which families are able to remain together while awaiting their proceedings", @@ -158,6 +162,7 @@ from .utils import ( # noqa: E402 get_ice_scrape_pages, # noqa: F401 repair_locality, # noqa: F401 + repair_name, # noqa: F401 repair_street, # noqa: F401 repair_zip, # noqa: F401 update_facility, # noqa: F401 diff --git a/ice_scrapers/facilities_scraper.py b/ice_scrapers/facilities_scraper.py index cbe5d97..7e8fa6e 100644 --- a/ice_scrapers/facilities_scraper.py +++ b/ice_scrapers/facilities_scraper.py @@ -50,6 +50,10 @@ def scrape_facilities(facilities_data: dict) -> dict: if cleaned: addr["locality"] = locality facility["_repaired_record"] = True + name, cleaned = repair_name(facility["name"], addr["locality"]) + if cleaned: + facility["name"] = name + facility["_repaired_record"] = True full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper() if not facility["address_str"]: facility["address_str"] = full_address diff --git a/ice_scrapers/spreadsheet_load.py b/ice_scrapers/spreadsheet_load.py index b169b9a..e3ea2a6 100644 --- a/ice_scrapers/spreadsheet_load.py +++ b/ice_scrapers/spreadsheet_load.py @@ -1,21 +1,22 @@ from bs4 import BeautifulSoup import copy import datetime -import os -import polars -import re -from schemas import ( - facility_schema, - field_office_schema, -) from ice_scrapers import ( facility_sheet_header, ice_facility_types, ice_inspection_types, repair_locality, + repair_name, repair_street, 
repair_zip, ) +import os +import polars +import re +from schemas import ( + facility_schema, + field_office_schema, +) from typing import Tuple from utils import ( logger, @@ -74,6 +75,7 @@ def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> Tup def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict: + logger.info("Collecting initial facility data from %s", base_xlsx_url) df, sheet_url = _download_sheet(keep_sheet, force_download) """Convert the detentionstats sheet data into something we can update our facilities with""" results: dict = {} @@ -93,6 +95,9 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict: details["phone"] = match.group(1) details["_repaired_record"] = True locality, cleaned = repair_locality(row["City"], row["State"]) + if cleaned: + details["_repaired_record"] = True + name, cleaned = repair_name(row["Name"], row["City"]) if cleaned: details["_repaired_record"] = True full_address = ",".join([street, locality, row["State"], zcode]).upper() @@ -100,7 +105,7 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict: details["address"]["locality"] = locality details["address"]["postal_code"] = zcode details["address"]["street"] = street - details["name"] = row["Name"] + details["name"] = name """ population statistics diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index d6d846e..7aeb800 100644 --- a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -128,6 +128,21 @@ def repair_street(street: str, locality: str = "") -> Tuple[str, bool]: return street, cleaned +def repair_name(name: str, locality: str) -> Tuple[str, bool]: + """Even facility names are occasionally bad""" + matches = [ + {"match": "ALEXANDRIA STAGING FACILI", "replace": "Alexandria Staging Facility", "locality": "ALEXANDRIA"}, + {"match": "ORANGE COUNTY JAIL (NY)", "replace": "ORANGE COUNTY JAIL", "locality": "GOSHEN"}, + ] + cleaned = False + for m in matches: + if 
m["match"] == name and m["locality"] == locality: + name = m["replace"] + cleaned = True + break + return name, cleaned + + def repair_zip(zip_code: int, locality: str) -> Tuple[str, bool]: """ Excel does a cool thing where it strips leading 0s diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index b087197..26daa94 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -1,4 +1,5 @@ import copy +from ice_scrapers import ice_facility_types import os import polars from schemas import facility_schema @@ -69,6 +70,22 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: "city": "Saipan", }, {"match": "Sitka City Jail, Sitka AK", "replace": "Sitka City Jail", "city": "Sitka"}, + {"match": "Leavenworth USP", "replace": "Leavenworth US Penitentiary", "city": "Leavenworth"}, + {"match": "Limestone County Jail", "replace": "Limestone County Detention Center", "city": "Groesbeck"}, + {"match": "FCI Berlin", "replace": "Berlin Fed. Corr. Inst.", "city": "Berlin"}, + {"match": "Nassau Co Correc Center", "replace": "Nassau County Correctional Center", "city": "East Meadow"}, + {"match": "Riverside Reg Jail", "replace": "Riverside Regional Jail", "city": "Hopewell"}, + {"match": "T Don Hutto Residential Center", "replace": "T Don Hutto Detention Center", "city": "Taylor"}, + {"match": "Desert View", "replace": "Desert View Annex", "city": "Adelanto"}, + {"match": "Alamance Co. Det. 
Facility", "replace": "Alamance County Detention Facility", "city": "Graham"}, + {"match": "Hall County Sheriff", "replace": "Hall County Department of Corrections", "city": "Grand Island"}, + {"match": "Hall County Sheriff", "replace": "Hall County Department of Corrections", "city": "Grand Island"}, + { + "match": "Dallas County Jail-Lew Sterrett", + "replace": "Dallas County Jail - Lew Sterrett Justice Center", + "city": "Dallas", + }, + {"match": "Hardin Co Jail", "replace": "Hardin County Jail", "city": "Eldora"}, ] fixed = False for m in matches: @@ -153,6 +170,11 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f facilities_data["facilities"][addr_str]["facility_type"]["id"] = row["type_detailed"] facilities_data["facilities"][addr_str]["facility_type"]["group"] = row["type_grouped"] facilities_data["facilities"][addr_str]["vera_id"] = row["detention_facility_code"] + ft_details = ice_facility_types.get(row["type_detailed"], {}) + if ft_details: + facilities_data["facilities"][addr_str]["facility_type"]["description"] = ft_details["description"] + facilities_data["facilities"][addr_str]["facility_type"]["expanded_name"] = ft_details["expanded_name"] + logger.info( " Found %s facilities: Skipped %s, Matched %s, corrected names on %s", df.height, From 70455039626f7cdfd942aa614b8c335308140f50 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 18:31:17 -0600 Subject: [PATCH 10/26] even more matching Signed-off-by: John Seekins --- ice_scrapers/utils.py | 1 + ice_scrapers/vera_data.py | 41 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index 7aeb800..9e423a3 100644 --- a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -133,6 +133,7 @@ def repair_name(name: str, locality: str) -> Tuple[str, bool]: matches = [ {"match": "ALEXANDRIA STAGING FACILI", "replace": "Alexandria Staging Facility", "locality": 
"ALEXANDRIA"}, {"match": "ORANGE COUNTY JAIL (NY)", "replace": "ORANGE COUNTY JAIL", "locality": "GOSHEN"}, + {"match": "NORTH LAKE CORRECTIONAL F", "replace": "NORTH LAKE CORRECTIONAL FACILITY", "locality": "Baldwin"}, ] cleaned = False for m in matches: diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index 26daa94..8ef496c 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -39,10 +39,15 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: {"match": "Freeborn County Jail, MN", "replace": "Freeborn County Jail", "city": "Albert Lea"}, {"match": "Fremont County Jail, CO", "replace": "Fremont County Jail", "city": "Canon City"}, {"match": "Fremont County Jail, WY", "replace": "Fremont County Jail", "city": "Lander"}, - {"match": "Grand Forks County Correc", "replace": "Grand Forks County Corrections", "city": "Grand Forks"}, - {"match": "Grand Forks Co. Juvenile", "replace": "Grand Forks County Juvenile", "city": "Grand Forks"}, + { + "match": "Grand Forks County Correc", + "replace": "Grand Forks County Correctional Facility", + "city": "Grand Forks", + }, + {"match": "Grand Forks Co. Juvenile", "replace": "Grand Forks County Juvenile Facility", "city": "Grand Forks"}, {"match": "Haile Det. Center", "replace": "Haile Detention Center", "city": "Caldwell"}, {"match": "Hampden Co.House Of Corr.", "replace": "Hampden County House of Corrections", "city": "Ludlow"}, + {"match": "Eloy Federal Contract Fac", "replace": "Eloy Federal Contract Facility", "city": "Eloy"}, { "match": "Henderson County Det. 
Fac.", "replace": "Henderson County Detention Facility", @@ -86,6 +91,38 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: "city": "Dallas", }, {"match": "Hardin Co Jail", "replace": "Hardin County Jail", "city": "Eldora"}, + {"match": "Washington County Jail", "replace": "Washington County Detention Center", "city": "Fayetteville"}, + {"match": "Robert A Deyton Detention Fac", "replace": "Robert A Deyton Detention Facility", "city": "Lovejoy"}, + {"match": "Anchorage Jail", "replace": "Anchorage Correctional Complex", "city": "Anchorage"}, + {"match": "Douglas Co. Wisconsin", "replace": "Douglas County", "city": "Superior"}, + { + "match": "Imperial Regional Adult Det Fac", + "replace": "Imperial Regional Detention Facility", + "city": "Calexico", + }, + {"match": "Erie County Jail, PA", "replace": "Erie County Jail", "city": "Erie"}, + {"match": "NW ICE Processing Ctr", "replace": "Northwest ICE Processing Center", "city": "Tacoma"}, + {"match": "Richwood Cor Center", "replace": "Richwood Correctional Center", "city": "Monroe"}, + {"match": "Krome North SPC", "replace": "Krome North Service Processing Center", "city": "Miami"}, + {"match": "Calhoun Co., Battle Cr,MI", "replace": "Calhoun County Correctional Center", "city": "Battle Creek"}, + {"match": "Dodge County Jail, Juneau", "replace": "Dodge County Jail", "city": "Juneau"}, + {"match": "Kandiyohi Co. Jail", "replace": "Kandiyohi County Jail", "city": "Willmar"}, + { + "match": "California City Corrections Center", + "replace": "California City Correctional Center", + "city": "California City", + }, + {"match": "Plymouth Co Cor Facilty", "replace": "Plymouth County Correctional Facility", "city": "Plymouth"}, + {"match": "Otero Co Processing Center", "replace": "Otero County Processing Center", "city": "Chaparral"}, + {"match": "Strafford Co Dept Of Corr", "replace": "Strafford County Corrections", "city": "Dover"}, + {"match": "Madison Co. 
Jail, MS.", "replace": "Madison County Jail", "city": "Canton"}, + { + "match": "South Texas Fam Residential Center", + "replace": "Dilley Immigration Processing Center", + "city": "Dilley", + }, + {"match": "Tulsa County Jail", "replace": "Tulsa County Jail (David L. Moss Justice Ctr)", "city": "Tulsa"}, + {"match": "Kenton Co Detention Ctr", "replace": "Kenton County Jail", "city": "Covington"}, ] fixed = False for m in matches: From 219d991edd72318aa262f4ceab7188acab3dca5e Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 18:32:00 -0600 Subject: [PATCH 11/26] another facility type match Signed-off-by: John Seekins --- ice_scrapers/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py index 4a2ea16..b5c712e 100644 --- a/ice_scrapers/__init__.py +++ b/ice_scrapers/__init__.py @@ -74,6 +74,10 @@ "expanded_name": "Department of Defence", "description": "Department of Defence facilities - Often Army bases", }, + "FAMILY": { + "expanded_name": "Family", + "description": "A facility in which families are able to remain together while awaiting their proceedings", + }, "Family": { "expanded_name": "Family", "description": "A facility in which families are able to remain together while awaiting their proceedings", From e8c287d52b7b9c615602c5e6fc27cbd4fa3c1a49 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 22:33:49 -0600 Subject: [PATCH 12/26] even more matching Signed-off-by: John Seekins --- ice_scrapers/utils.py | 34 +++++++++++++++++++++++++++++- ice_scrapers/vera_data.py | 44 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index 9e423a3..bdc1825 100644 --- a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -133,7 +133,39 @@ def repair_name(name: str, locality: str) -> Tuple[str, bool]: matches = [ {"match": "ALEXANDRIA STAGING FACILI", "replace": "Alexandria Staging 
Facility", "locality": "ALEXANDRIA"}, {"match": "ORANGE COUNTY JAIL (NY)", "replace": "ORANGE COUNTY JAIL", "locality": "GOSHEN"}, - {"match": "NORTH LAKE CORRECTIONAL F", "replace": "NORTH LAKE CORRECTIONAL FACILITY", "locality": "Baldwin"}, + {"match": "NORTH LAKE CORRECTIONAL F", "replace": "NORTH LAKE CORRECTIONAL FACILITY", "locality": "BALDWIN"}, + {"match": "PHELPS COUNTY JAIL (MO)", "replace": "Phelps County Jail", "locality": "ROLLA"}, + { + "match": "PENNINGTON COUNTY JAIL (SOUTH DAKOTA)", + "replace": "PENNINGTON COUNTY JAIL", + "locality": "RAPID CITY", + }, + { + "match": "CORR. CTR OF NORTHWEST OHIO", + "replace": "CORRECTIONS CENTER OF NORTHWEST OHIO", + "locality": "STRYKER", + }, + { + "match": "FOLKSTON D RAY ICE PROCES", + "replace": "D. RAY JAMES CORRECTIONAL INSTITUTION", + "locality": "FOLKSTON", + }, + {"match": "COLLIER COUNTY NAPLES JAIL CENTER", "replace": "COLLIER COUNTY JAIL", "locality": "NAPLES"}, + { + "match": "IAH SECURE ADULT DETENTION FACILITY (POLK)", + "replace": "IAM SECURE ADULT DET. 
FACILITY", + "locality": "LIVINGSTON", + }, + {"match": "CIMMARRON CORR FACILITY", "replace": "CIMMARRON CORRECTIONAL FACILITY", "locality": "CUSHING"}, + {"match": "ORANGE COUNTY JAIL (FL)", "replace": "ORANGE COUNTY JAIL", "locality": "ORLANDO"}, + {"match": "CLARK COUNTY JAIL (IN)", "replace": "CLARK COUNTY JAIL", "locality": "JEFFERSONVILLE"}, + {"match": "PRINCE EDWARD COUNTY (FARMVILLE)", "replace": "ICA - FARMVILLE", "locality": "FARMVILLE"}, + {"match": "PHELPS COUNTY JAIL (NE)", "replace": "PHELPS COUNTY JAIL", "locality": "HOLDREGE"}, + { + "match": "WASHINGTON COUNTY JAIL (PURGATORY CORRECTIONAL FAC", + "replace": "WASHINGTON COUNTY JAIL", + "locality": "HURRICANE", + }, ] cleaned = False for m in matches: diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index 8ef496c..0bece3c 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -36,7 +36,7 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: {"match": "Dade Correctional Inst", "replace": "Dade Correctional Institution", "city": "Florida City"}, {"match": "Franklin County Jail, VT", "replace": "Franklin County Jail", "city": "Saint Albans"}, {"match": "Frederick County Det. Cen", "replace": "Frederick County Detention Center", "city": "Frederick"}, - {"match": "Freeborn County Jail, MN", "replace": "Freeborn County Jail", "city": "Albert Lea"}, + {"match": "Freeborn County Jail, MN", "replace": "Freeborn Adult Detention Center", "city": "Albert Lea"}, {"match": "Fremont County Jail, CO", "replace": "Fremont County Jail", "city": "Canon City"}, {"match": "Fremont County Jail, WY", "replace": "Fremont County Jail", "city": "Lander"}, { @@ -123,6 +123,40 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: }, {"match": "Tulsa County Jail", "replace": "Tulsa County Jail (David L. 
Moss Justice Ctr)", "city": "Tulsa"}, {"match": "Kenton Co Detention Ctr", "replace": "Kenton County Jail", "city": "Covington"}, + {"match": "Pennington County Jail SD", "replace": "Pennington County Jail", "city": "Rapid City"}, + {"match": "Denver Contract Det. Fac.", "replace": "Denver Contract Detention Facility", "city": "Aurora"}, + { + "match": "Corrections Center of NW Ohio", + "replace": "Corrections Center of Northwest Ohio", + "city": "Stryker", + }, + {"match": "Grayson County Detention Center", "replace": "Grayson County Jail", "city": "Leitchfield"}, + {"match": "Chippewa Co, SSM", "replace": "Chippewa County SSM", "city": "Sault Sainte Marie"}, + {"match": "Florence SPC", "replace": "Florence Service Processing Center", "city": "Florence"}, + {"match": "D. Ray James Prison", "replace": "D. Ray James Correctional Institution", "city": "Folkston"}, + {"match": "Collier County Sheriff", "replace": "Collier County Jail", "city": "Naples"}, + {"match": "Oldham County Jail", "replace": "Oldham County Detention Center", "city": "La Grange"}, + {"match": "Salt Lake County Jail", "replace": "Salt Lake County Metro Jail", "city": "Salt Lake City"}, + {"match": "Annex Folkston IPC", "replace": "Folkston Annex IPC", "city": "Folkston"}, + { + "match": "Northwest State Correctional Ctr.", + "replace": "Northwest State Correctional Center", + "city": "Swanton", + }, + {"match": "Basile Detention Center", "replace": "South Louisiana ICE Processing Center", "city": "Basile"}, + {"match": "New Hanover Co Det Center", "replace": "New Hanover County Jail", "city": "Castle Hayne"}, + {"match": "Bluebonnet Det Fclty", "replace": "Bluebonnet Detention Facility", "city": "Anson"}, + {"match": "San Luis Regional Det Center", "replace": "San Luis Regional Detention Center", "city": "San Luis"}, + {"match": "Buffalo SPC", "replace": "Buffalo Service Processing Center", "city": "Batavia"}, + {"match": "Laurel County Corrections", "replace": "Laurel County Correctional 
Center", "city": "London"}, + {"match": "Coastal Bend Det. Facility", "replace": "Coastal Bend Detention Facility", "city": "Robstown"}, + {"match": "Winn Corr Institute", "replace": "Winn Correctional Center", "city": "Winnfield"}, + {"match": "Elizabeth Contract D.F.", "replace": "Elizabeth Contract Detention Faciilty", "city": "Elizabeth"}, + { + "match": "Chittenden Reg. Cor. Facility", + "replace": "Chittenden Regional Correctional Facility", + "city": "South Burlington", + }, ] fixed = False for m in matches: @@ -137,6 +171,7 @@ def _vera_city_fixes(city: str, state: str) -> Tuple[str, bool]: """There are a few cases where getting a state match requires some munging""" matches = [ {"match": "Saipan", "replace": "Susupe, Saipan", "city": "MP"}, + {"match": "Sault Sainte Marie", "replace": "Sault Ste Marie", "city": "MI"}, ] fixed = False for m in matches: @@ -159,11 +194,18 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f f.write(chunk) logger.debug("Wrote %s byte sheet to %s", size, filename) df = polars.read_csv(has_header=True, raise_if_empty=True, source=filename, use_pyarrow=True) + if df.is_empty(): + raise ValueError("Empty CSV loaded somehow! %s", df) + # first step to removing duplicates is easy + df = df.unique() logger.debug("Extracted data: %s", df) """ We retrieve the following columns detention_facility_code, detention_facility_name, latitude, longitude, city, state, type_detailed, type_grouped + There are definitely rows that are _essentially_ duplicates, but aren't actually duplicates? + A single facility (based on the latitude/longitudes) will show up with multiple names in this dataset + None of the data Vera provides on a facility is more accurate than data we already have, so the logic here should be _purely_ "if not exists, add". 
""" From f8c1473974ccd793ed6edd9a6fef12261cd3a8c7 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Wed, 24 Sep 2025 22:48:21 -0600 Subject: [PATCH 13/26] possibly the last of the matches Signed-off-by: John Seekins --- ice_scrapers/utils.py | 1 + ice_scrapers/vera_data.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index bdc1825..85575ae 100644 --- a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -166,6 +166,7 @@ def repair_name(name: str, locality: str) -> Tuple[str, bool]: "replace": "WASHINGTON COUNTY JAIL", "locality": "HURRICANE", }, + {"match": "ETOWAH COUNTY JAIL (ALABAMA)", "replace": "ETOWAH COUNTY JAIL", "locality": "GADSDEN"}, ] cleaned = False for m in matches: diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index 0bece3c..f04b510 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -157,6 +157,27 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: "replace": "Chittenden Regional Correctional Facility", "city": "South Burlington", }, + { + "match": "NW Regional Corrections Center", + "replace": "Northwest Regional Corrections Center", + "city": "Crookston", + }, + { + "match": "Lasalle ICE Processing Center", + "replace": "Central Louisiana ICE Processing Center (CLIPC)", + "city": "Jena", + }, + { + "match": "La Salle Co Regional Det. Center", + "replace": "La Salle County Regional Detention Center", + "city": "Encinal", + }, + { + "match": "Hancock Co Pub Sfty Cplx", + "replace": "Hancock County Public Safety Complex", + "city": "Bay St. 
Louis", + }, + {"match": "Brooks County Jail (Contract)", "replace": "Brooks County Jail", "city": "Falfurrias"}, ] fixed = False for m in matches: @@ -196,7 +217,7 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f df = polars.read_csv(has_header=True, raise_if_empty=True, source=filename, use_pyarrow=True) if df.is_empty(): raise ValueError("Empty CSV loaded somehow! %s", df) - # first step to removing duplicates is easy + # first step to removing duplicates is easy, but unlikely to actually filter anything df = df.unique() logger.debug("Extracted data: %s", df) """ From cded7f9dfd440b91e8ec2b68b3890fa6195bab53 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Thu, 25 Sep 2025 07:36:01 -0600 Subject: [PATCH 14/26] actually add vera url when we update records Signed-off-by: John Seekins --- ice_scrapers/vera_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index f04b510..1887910 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -254,6 +254,7 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f facilities_data["facilities"][k]["osm"]["latitude"] = row["latitude"] facilities_data["facilities"][k]["osm"]["longitude"] = row["longitude"] facilities_data["facilities"][k]["vera_id"] = row["detention_facility_code"] + facilities_data["facilities"][k]["source_urls"].append(base_url) matched_count += 1 found = True break From 886b907873eabd17b9a406f9f1dc55b0690cf5fc Mon Sep 17 00:00:00 2001 From: John Seekins Date: Thu, 25 Sep 2025 07:44:18 -0600 Subject: [PATCH 15/26] only one job needs facility_sheet_header, keep it with that job Signed-off-by: John Seekins --- ice_scrapers/__init__.py | 32 ----------------------------- ice_scrapers/spreadsheet_load.py | 35 ++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py index 
b5c712e..17a1a64 100644 --- a/ice_scrapers/__init__.py +++ b/ice_scrapers/__init__.py @@ -4,38 +4,6 @@ may call them """ -# extracted ADP sheet header list 2025-09-07 -facility_sheet_header = [ - "Name", - "Address", - "City", - "State", - "Zip", - "AOR", - "Type Detailed", - "Male/Female", - "FY25 ALOS", - "Level A", - "Level B", - "Level C", - "Level D", - "Male Crim", - "Male Non-Crim", - "Female Crim", - "Female Non-Crim", - "ICE Threat Level 1", - "ICE Threat Level 2", - "ICE Threat Level 3", - "No ICE Threat Level", - "Mandatory", - "Guaranteed Minimum", - "Last Inspection Type", - "Last Inspection End Date", - "Pending FY25 Inspection", - "Last Inspection Standard", - "Last Final Rating", -] - ice_inspection_types = { # found in https://www.ice.gov/foia/odo-facility-inspections "ODO": "Office of Detention Oversight", diff --git a/ice_scrapers/spreadsheet_load.py b/ice_scrapers/spreadsheet_load.py index e3ea2a6..1303122 100644 --- a/ice_scrapers/spreadsheet_load.py +++ b/ice_scrapers/spreadsheet_load.py @@ -2,7 +2,6 @@ import copy import datetime from ice_scrapers import ( - facility_sheet_header, ice_facility_types, ice_inspection_types, repair_locality, @@ -27,6 +26,38 @@ base_xlsx_url = "https://www.ice.gov/detain/detention-management" filename = f"{SCRIPT_DIR}{os.sep}detentionstats.xlsx" +# extracted ADP sheet header list 2025-09-07 +facility_sheet_header = [ + "Name", + "Address", + "City", + "State", + "Zip", + "AOR", + "Type Detailed", + "Male/Female", + "FY25 ALOS", + "Level A", + "Level B", + "Level C", + "Level D", + "Male Crim", + "Male Non-Crim", + "Female Crim", + "Female Non-Crim", + "ICE Threat Level 1", + "ICE Threat Level 2", + "ICE Threat Level 3", + "No ICE Threat Level", + "Mandatory", + "Guaranteed Minimum", + "Last Inspection Type", + "Last Inspection End Date", + "Pending FY25 Inspection", + "Last Inspection Standard", + "Last Final Rating", +] + def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> 
Tuple[polars.DataFrame, str]: """Download the detention stats sheet from ice.gov""" @@ -154,9 +185,9 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict: "last_rating": row["Last Final Rating"], } details["source_urls"].append(sheet_url) - # details["field_office"] = self.field_offices["field_offices"][area_of_responsibility[row["AOR"]]] details["field_office"] = copy.deepcopy(field_office_schema) details["field_office"]["id"] = row["AOR"] details["address_str"] = full_address results[full_address] = details + logger.info(" Loaded %s facilities", len(results.keys())) return results From 23886dcffd5a1b9ecc6c4de9a5c026ac3ea0123b Mon Sep 17 00:00:00 2001 From: John Seekins Date: Thu, 25 Sep 2025 08:57:16 -0600 Subject: [PATCH 16/26] update group mappings Signed-off-by: John Seekins --- ice_scrapers/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py index 17a1a64..e00fa86 100644 --- a/ice_scrapers/__init__.py +++ b/ice_scrapers/__init__.py @@ -17,10 +17,10 @@ "Dedicated": ["DIGSA", "CDF", "SPC"], "Federal": ["BOF", "USMSIGA", "USMS IGA", "USMS CDF", "DOD", "MOC"], "Hold/Staging": ["Hold", "Staging"], - "Family/Youth": ["Family", "Juvenile"], + "Family/Youth": ["Family", "Juvenile", "FAMILY"], "Medical": ["Hospital"], "Hotel": ["Hotel"], - "Other/Unknown": ["Other", "Unknown"], + "Other/Unknown": ["Other", "Unknown", "Pending"], } From e5e8ad65a3be4d3a2455752e865b071f354499ff Mon Sep 17 00:00:00 2001 From: John Seekins Date: Mon, 29 Sep 2025 21:59:38 -0600 Subject: [PATCH 17/26] more vera matching, skip enrichment when it's only vera Signed-off-by: John Seekins --- enrichers/general.py | 3 +++ ice_scrapers/general.py | 2 +- ice_scrapers/utils.py | 2 ++ ice_scrapers/vera_data.py | 6 ++++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git 
a/enrichers/general.py b/enrichers/general.py index c7730c5..6e7acd3 100644 --- a/enrichers/general.py +++ b/enrichers/general.py @@ -36,6 +36,9 @@ def _enrich_facility(facility_data: tuple) -> tuple: """enrich a single facility""" facility_id, facility = facility_data facility_name = facility["name"] + if len(facility["source_urls"]) == 1 and "vera-institute/ice-detention-trends" in facility["source_urls"][0]: + logger.debug(" Skipping enrichment of facility with only vera.org data: %s", facility["name"]) + return facility_id, facility logger.info("Enriching facility %s...", facility_name) enriched_facility = copy.deepcopy(facility) diff --git a/ice_scrapers/general.py b/ice_scrapers/general.py index dba34ce..8267e85 100644 --- a/ice_scrapers/general.py +++ b/ice_scrapers/general.py @@ -14,8 +14,8 @@ def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = Tr facilities_data = copy.deepcopy(facilities_schema) facilities = load_sheet(keep_sheet, force_download) facilities_data["facilities"] = copy.deepcopy(facilities) - facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download) facilities_data = scrape_facilities(facilities_data) + facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download) field_offices = scrape_field_offices() facilities_data = merge_field_offices(facilities_data, field_offices) facilities_data = insert_additional_facilities(facilities_data) diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index 4c73a4c..b289113 100644 --- a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -195,6 +195,8 @@ def repair_name(name: str, locality: str) -> Tuple[str, bool]: "locality": "HURRICANE", }, {"match": "ETOWAH COUNTY JAIL (ALABAMA)", "replace": "ETOWAH COUNTY JAIL", "locality": "GADSDEN"}, + {"match": "BURLEIGH COUNTY", "replace": "BURLEIGH COUNTY JAIL", "locality": "BISMARCK"}, + {"match": "NELSON COLEMAN CORRECTION", "replace": "NELSON COLEMAN CORRECTIONS CENTER", 
"locality": "KILLONA"}, ] cleaned = False for m in matches: diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index f9097e5..ca43311 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -178,6 +178,12 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: "city": "Bay St. Louis", }, {"match": "Brooks County Jail (Contract)", "replace": "Brooks County Jail", "city": "Falfurrias"}, + {"match": "Burleigh Co. Jail, ND", "replace": "Burleigh County Jail", "city": "Bismarck"}, + {"match": "Lubbock County Jail", "replace": "Lubbock County Detention Center", "city": "Lubbock"}, + {"match": "Montgomery County Jail", "replace": "Montgomery Ice Processing Center", "city": "Conroe"}, + {"match": "Sebastian County Det Cnt", "replace": "Sebastian County Detention Center", "city": "Smith"}, + {"match": "Atlanta U.S. Pen.", "replace": "FCI Atlanta", "city": "Atlanta"}, + {"match": "Clinton County Corr. Fac.", "replace": "Clinton County Correctional Facility", "city": "McElhattan"}, ] fixed = False for m in matches: From 5c09f41239e004e76f43540cfb81197214768539 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Mon, 29 Sep 2025 22:05:43 -0600 Subject: [PATCH 18/26] more matching Signed-off-by: John Seekins --- ice_scrapers/utils.py | 5 +++++ ice_scrapers/vera_data.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index b289113..6f06ca8 100644 --- a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -197,6 +197,11 @@ def repair_name(name: str, locality: str) -> Tuple[str, bool]: {"match": "ETOWAH COUNTY JAIL (ALABAMA)", "replace": "ETOWAH COUNTY JAIL", "locality": "GADSDEN"}, {"match": "BURLEIGH COUNTY", "replace": "BURLEIGH COUNTY JAIL", "locality": "BISMARCK"}, {"match": "NELSON COLEMAN CORRECTION", "replace": "NELSON COLEMAN CORRECTIONS CENTER", "locality": "KILLONA"}, + { + "match": "CIMMARRON CORRECTIONAL FACILITY", + "replace": "CIMARRON CORRECTIONAL 
FACILITY", + "locality": "CUSHING", + }, ] cleaned = False for m in matches: diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index ca43311..834e7f8 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -183,7 +183,8 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: {"match": "Montgomery County Jail", "replace": "Montgomery Ice Processing Center", "city": "Conroe"}, {"match": "Sebastian County Det Cnt", "replace": "Sebastian County Detention Center", "city": "Smith"}, {"match": "Atlanta U.S. Pen.", "replace": "FCI Atlanta", "city": "Atlanta"}, - {"match": "Clinton County Corr. Fac.", "replace": "Clinton County Correctional Facility", "city": "McElhattan"}, + {"match": "Clinton County Corr. Fac.", "replace": "Clinton County Correctional Facility", "city": "Mcelhattan"}, + {"match": "Freeborn County Jail, MN", "replace": "Freeborn County Adult Detention", "city": "Albert Lea"}, ] fixed = False for m in matches: From fd6cd400de6e40014f976cfa7a37ae27d0bc6e66 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Mon, 29 Sep 2025 22:08:07 -0600 Subject: [PATCH 19/26] track repairs with vera data Signed-off-by: John Seekins --- ice_scrapers/vera_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index 834e7f8..d607166 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -268,6 +268,8 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f facilities_data["facilities"][k]["osm"]["longitude"] = row["longitude"] facilities_data["facilities"][k]["vera_id"] = row["detention_facility_code"] facilities_data["facilities"][k]["source_urls"].append(base_url) + if fixed_name or fixed_city: + facilities_data["facilities"][k]["_repaired_record"] = True matched_count += 1 found = True break @@ -287,6 +289,8 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f if ft_details: 
facilities_data["facilities"][addr_str]["facility_type"]["description"] = ft_details["description"] facilities_data["facilities"][addr_str]["facility_type"]["expanded_name"] = ft_details["expanded_name"] + if fixed_name or fixed_city: + facilities_data["facilities"][addr_str]["_repaired_record"] = True logger.info( " Found %s facilities: Skipped %s, Matched %s, corrected names on %s", From c135509ca22edb93d85a9d1f68aea579efd9179a Mon Sep 17 00:00:00 2001 From: John Seekins Date: Tue, 30 Sep 2025 14:13:55 -0600 Subject: [PATCH 20/26] slightly more matching Signed-off-by: John Seekins --- ice_scrapers/utils.py | 7 ++++++- ice_scrapers/vera_data.py | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index 6f06ca8..e806fe0 100644 --- a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -198,10 +198,15 @@ def repair_name(name: str, locality: str) -> Tuple[str, bool]: {"match": "BURLEIGH COUNTY", "replace": "BURLEIGH COUNTY JAIL", "locality": "BISMARCK"}, {"match": "NELSON COLEMAN CORRECTION", "replace": "NELSON COLEMAN CORRECTIONS CENTER", "locality": "KILLONA"}, { - "match": "CIMMARRON CORRECTIONAL FACILITY", + "match": "CIMMARRON CORR FACILITY", "replace": "CIMARRON CORRECTIONAL FACILITY", "locality": "CUSHING", }, + { + "match": "IAM SECURE ADULT DET. FACILITY", + "replace": "IAH SECURE ADULT DET. FACILITY", + "locality": "LIVINGSTON", + }, ] cleaned = False for m in matches: diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py index d607166..95ad0b2 100644 --- a/ice_scrapers/vera_data.py +++ b/ice_scrapers/vera_data.py @@ -181,10 +181,14 @@ def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: {"match": "Burleigh Co. 
Jail, ND", "replace": "Burleigh County Jail", "city": "Bismarck"}, {"match": "Lubbock County Jail", "replace": "Lubbock County Detention Center", "city": "Lubbock"}, {"match": "Montgomery County Jail", "replace": "Montgomery Ice Processing Center", "city": "Conroe"}, - {"match": "Sebastian County Det Cnt", "replace": "Sebastian County Detention Center", "city": "Smith"}, + {"match": "Sebastian County Det Cnt", "replace": "Sebastian County Detention Center", "city": "Fort Smith"}, {"match": "Atlanta U.S. Pen.", "replace": "FCI Atlanta", "city": "Atlanta"}, {"match": "Clinton County Corr. Fac.", "replace": "Clinton County Correctional Facility", "city": "Mcelhattan"}, - {"match": "Freeborn County Jail, MN", "replace": "Freeborn County Adult Detention", "city": "Albert Lea"}, + { + "match": "Freeborn County Jail, MN", + "replace": "Freeborn County Adult Detention Center", + "city": "Albert Lea", + }, ] fixed = False for m in matches: From 27c021363e77ae54998a7cb724dbecbb1b938b3f Mon Sep 17 00:00:00 2001 From: John Seekins Date: Tue, 30 Sep 2025 18:09:55 -0600 Subject: [PATCH 21/26] slightly restrict zip generation Signed-off-by: John Seekins --- ice_scrapers/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index e806fe0..df7885c 100644 --- a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -224,7 +224,8 @@ def repair_zip(zip_code: int, locality: str) -> Tuple[str, bool]: """ zcode = str(zip_code) cleaned = False - if len(zcode) < 5: + # don't replace an empty zip with all 0s + if 0 < len(zcode) < 5: # pad any prefix zeros = "0" * (5 - len(zcode)) zcode = f"{zeros}{zcode}" From 0c25f4ac902522e8605cbde2b763554de815c4a6 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Tue, 30 Sep 2025 20:38:16 -0600 Subject: [PATCH 22/26] slightly nicer typing Signed-off-by: John Seekins --- file_utils.py | 6 ++++-- ice_scrapers/custom_facilities.py | 2 +- ice_scrapers/facilities_scraper.py | 6 ++---- 
schemas.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/file_utils.py b/file_utils.py index 329f01f..f9a536c 100644 --- a/file_utils.py +++ b/file_utils.py @@ -23,12 +23,14 @@ def export_to_file( match file_type: case "xlsx": with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb: - writer.write_excel(workbook=wb, include_header=True, autofit=True) + _ = writer.write_excel(workbook=wb, include_header=True, autofit=True) case "csv": with open(full_name, "w", newline="", encoding="utf-8") as f_out: writer.write_csv(file=f_out, include_header=True) case "parquet": writer.write_parquet(full_name, use_pyarrow=True) + case _: + logger.warning("Invalid dataframe output type %s", file_type) elif file_type == "json": with open(full_name, "w", encoding="utf-8") as f_out: json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str) @@ -103,7 +105,7 @@ def print_summary(facilities_data: dict) -> None: false_positives = 0 errors = 0 for facility in facilities_data["facilities"].values(): - query = facility.get("wikipedia", {}).get("search_query", "") + query: str = facility.get("wikipedia", {}).get("search_query", "") if "REJECTED" in query: false_positives += 1 elif "ERROR" in query: diff --git a/ice_scrapers/custom_facilities.py b/ice_scrapers/custom_facilities.py index 6854830..67b32c9 100644 --- a/ice_scrapers/custom_facilities.py +++ b/ice_scrapers/custom_facilities.py @@ -3,7 +3,7 @@ """ Handle manually discovered/managed facilities """ -custom_facilities = { +custom_facilities: dict = { "2309 North Highway 83,McCook,NE,69001": { "_repaired_record": False, "address": { diff --git a/ice_scrapers/facilities_scraper.py b/ice_scrapers/facilities_scraper.py index fa3f0d8..8c45cdb 100644 --- a/ice_scrapers/facilities_scraper.py +++ b/ice_scrapers/facilities_scraper.py @@ -83,7 +83,7 @@ def scrape_facilities(facilities_data: dict) -> dict: return facilities_data -def _scrape_updated(url: str): +def _scrape_updated(url: str) -> 
datetime.datetime: """ Scrape url to get "last updated" time Is specifically oriented around ice.gov facility pages @@ -97,7 +97,7 @@ def _scrape_updated(url: str): response.raise_for_status() except Exception as e: logger.error(" Error parsing %s: %s", url, e) - return [] + return datetime.datetime.strptime(default_timestamp, timestamp_format) soup = BeautifulSoup(response.content, "html.parser") times = soup.findAll("time") if not times: @@ -181,7 +181,6 @@ def _scrape_page(page_url: str) -> list: facilities.append(facility_data) logger.info(" Extracted %s facilities from page", len(facilities)) - return facilities @@ -194,7 +193,6 @@ def _find_facility_patterns(container): r"([A-Z][^|]+(?:\|[^|]+)?)\s*([A-Z][^A-Z]*Field Office)", r"([^-]+)\s*-\s*([A-Z][^A-Z]*Field Office)", ] - text_content = container.get_text() for pattern in facility_patterns: diff --git a/schemas.py b/schemas.py index 84f8ca2..8eae6e9 100644 --- a/schemas.py +++ b/schemas.py @@ -1,7 +1,7 @@ import copy import datetime -facilities_schema = { +facilities_schema: dict = { "enrich_runtime": 0, "facilities": {}, "scrape_runtime": 0, @@ -107,7 +107,7 @@ # enrichment response object -enrich_resp_schema = { +enrich_resp_schema: dict = { "cleaned_name": "", "details": {}, "enrichment_type": "", From 95642da8d0c44d32daf34eacbe25dc1ff6cbbf05 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Fri, 3 Oct 2025 17:21:41 -0600 Subject: [PATCH 23/26] add skip-vera switch Signed-off-by: John Seekins --- ice_scrapers/general.py | 5 +++-- main.py | 9 ++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ice_scrapers/general.py b/ice_scrapers/general.py index 8267e85..b519085 100644 --- a/ice_scrapers/general.py +++ b/ice_scrapers/general.py @@ -10,12 +10,13 @@ from schemas import facilities_schema -def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True) -> dict: +def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = 
False) -> dict: facilities_data = copy.deepcopy(facilities_schema) facilities = load_sheet(keep_sheet, force_download) facilities_data["facilities"] = copy.deepcopy(facilities) facilities_data = scrape_facilities(facilities_data) - facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download) + if not skip_vera: + facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download) field_offices = scrape_field_offices() facilities_data = merge_field_offices(facilities_data, field_offices) facilities_data = insert_additional_facilities(facilities_data) diff --git a/main.py b/main.py index f36598d..2496839 100644 --- a/main.py +++ b/main.py @@ -102,6 +102,11 @@ def main() -> None: action="store_true", help="Remove any sheets we downloaded", ) + parser.add_argument( + "--skip-vera", + action="store_true", + help="Don't collect vera.org data", + ) args = parser.parse_args() if args.debug: @@ -124,7 +129,9 @@ def main() -> None: if args.scrape: facilities_data = facilities_scrape_wrapper( - keep_sheet=not args.delete_sheets, force_download=not args.skip_downloads + keep_sheet=not args.delete_sheets, + force_download=not args.skip_downloads, + skip_vera=args.skip_vera, ) elif args.load_existing: facilities_data = copy.deepcopy(default_data.facilities_data) From d158481c858f5137c37705609de48dbd7a1a779a Mon Sep 17 00:00:00 2001 From: John Seekins Date: Fri, 3 Oct 2025 18:24:51 -0600 Subject: [PATCH 24/26] add tool to find facilities not getting a vera.org ID Signed-off-by: John Seekins --- tools/find_missing_vera.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tools/find_missing_vera.py diff --git a/tools/find_missing_vera.py b/tools/find_missing_vera.py new file mode 100644 index 0000000..773e313 --- /dev/null +++ b/tools/find_missing_vera.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +import json +import os +import pprint +import subprocess + +SCRIPT_DIR = 
os.path.dirname(os.path.realpath(__file__)) + + +def _find_files(directory: str) -> os.DirEntry[str]: + results = [] + with os.scandir(directory) as d: + for f in d: + if f.name.startswith("ice_detention_facilities") and f.name.endswith(".json"): + results.append(f) + final = results[0] + for f in results: + if f.stat().st_mtime > final.stat().st_mtime: + final = f + return final + + +def main() -> None: + res = subprocess.run(["git", "rev-parse", "--show-toplevel"], capture_output=True) + root_dir = [f for f in res.stdout.decode("utf-8").split("\n")][0] + newest_file = _find_files(root_dir) + with open(newest_file.path, "r") as f_in: + data = json.load(f_in) + missing_vera = {k: v for k, v in data["facilities"].items() if not v.get("vera_id", "")} + pprint.pprint(missing_vera, indent=1, compact=True) + print(f"Found {len(missing_vera.keys())} facilities with a missing vera.org ID") + + +if __name__ == "__main__": + main() From 2466b23b5647887db711aad47b40499a6b7ec8a7 Mon Sep 17 00:00:00 2001 From: John Seekins Date: Sun, 5 Oct 2025 16:17:05 -0600 Subject: [PATCH 25/26] don't overwrite existing lat/long/etc. 
unless we have new values Signed-off-by: John Seekins --- enrichers/general.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/enrichers/general.py b/enrichers/general.py index 6e7acd3..3c14163 100644 --- a/enrichers/general.py +++ b/enrichers/general.py @@ -50,9 +50,15 @@ def _enrich_facility(facility_data: tuple) -> tuple: enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "") enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "") enriched_facility["wikidata"]["search_query"] = wd_res.get("search_query_steps", "") - enriched_facility["osm"]["latitude"] = osm_res.get("details", {}).get("latitude", osm.default_coords["latitude"]) - enriched_facility["osm"]["longitude"] = osm_res.get("details", {}).get("longitude", osm.default_coords["longitude"]) - enriched_facility["osm"]["url"] = osm_res.get("url", "") + lat = osm_res.get("details", {}).get("latitude", None) + long = osm_res.get("details", {}).get("longitude", None) + if lat: + enriched_facility["osm"]["latitude"] = lat + if long: + enriched_facility["osm"]["longitude"] = long + url = osm_res.get("url", None) + if url: + enriched_facility["osm"]["url"] = url enriched_facility["osm"]["search_query"] = osm_res.get("search_query_steps", "") logger.debug(enriched_facility) From 88fbd69920a6b5920ed14badbbf1bf3d1c302a4f Mon Sep 17 00:00:00 2001 From: John Seekins Date: Sun, 5 Oct 2025 16:18:25 -0600 Subject: [PATCH 26/26] only enrich if there's data to enrich Signed-off-by: John Seekins --- enrichers/general.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/enrichers/general.py b/enrichers/general.py index 3c14163..9edd669 100644 --- a/enrichers/general.py +++ b/enrichers/general.py @@ -46,9 +46,13 @@ def _enrich_facility(facility_data: tuple) -> tuple: wd_res = wikidata.Wikidata(facility_name=facility_name).search() osm = openstreetmap.OpenStreetMap(facility_name=facility_name, address=facility.get("address", {}))
osm_res = osm.search() - enriched_facility["wikipedia"]["page_url"] = wiki_res.get("url", "") + url = wiki_res.get("url", None) + if url: + enriched_facility["wikipedia"]["page_url"] = url enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "") - enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "") + url = wd_res.get("url", None) + if url: + enriched_facility["wikidata"]["page_url"] = url enriched_facility["wikidata"]["search_query"] = wd_res.get("search_query_steps", "") lat = osm_res.get("details", {}).get("latitude", None) long = osm_res.get("details", {}).get("longitude", None)