diff --git a/default_data.py b/default_data.py index ad1a35c..b0381d0 100644 --- a/default_data.py +++ b/default_data.py @@ -70,6 +70,7 @@ "https://www.ice.gov/detention-facilities?page=0&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/baker-county-facility", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -131,6 +132,7 @@ "total": 1.916666666666666, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -200,6 +202,7 @@ "https://www.ice.gov/detention-facilities?page=4&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/northwest-ice-processing-center-nwipc", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -261,6 +264,7 @@ "total": 2.4642857142857095, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -326,6 +330,7 @@ "total": 5.038690476190489, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -395,6 +400,7 @@ "https://www.ice.gov/detention-facilities?page=3&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/grayson-county-detention-center", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -460,6 +466,7 @@ "total": 16.732142857143007, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", 
"search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -525,6 +532,7 @@ "total": 20.55952380952385, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -594,6 +602,7 @@ "https://www.ice.gov/detention-facilities?page=5&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/san-luis-regional-detention-center", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -608,7 +617,7 @@ }, "address_str": "409 FM 1144,KARNES CITY,TX,78118", "facility_type": { - "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees \u2013 typically these are operated by private contractors pursuant to their agreements with local governments.", + "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees. 
Typically these are operated by private contractors pursuant to their agreements with local governments.", "expanded_name": "Dedicated Intergovernmental Service Agreement", "id": "DIGSA", }, @@ -663,6 +672,7 @@ "https://www.ice.gov/detention-facilities?page=3&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/karnes-county-ipc", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -728,6 +738,7 @@ "https://www.ice.gov/detention-facilities?page=1&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/delaney-hall-detention-facility", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -793,6 +804,7 @@ "total": 28.62202380952395, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -862,6 +874,7 @@ "https://www.ice.gov/detention-facilities?page=4&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/moshannon-valley-processing-center", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -919,6 +932,7 @@ "total": 13.041666666666726, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -984,6 +998,7 @@ "https://www.ice.gov/detention-facilities?page=0&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/butler-county-sheriffs-office", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1053,6 +1068,7 @@ 
"https://www.ice.gov/detention-facilities?page=4&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/phelps-county-jail", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1122,6 +1138,7 @@ "https://www.ice.gov/detention-facilities?page=1&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/laredo-detention-center", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1187,6 +1204,7 @@ "https://www.ice.gov/detention-facilities?page=1&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/fort-bliss-detention-facility", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1235,6 +1253,7 @@ "https://www.ice.gov/detention-facilities?page=4&exposed_form_display=1", "https://www.ice.gov/detain/detention-facilities/naval-station-guantanamo-bay", ], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, @@ -1296,6 +1315,7 @@ "total": 1.5208333333333308, }, "source_urls": ["https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx"], + "vera_id": "", "wikidata": {"page_url": "", "search_query": ""}, "wikipedia": {"page_url": "", "search_query": ""}, }, diff --git a/enrichers/general.py b/enrichers/general.py index c7730c5..9edd669 100644 --- a/enrichers/general.py +++ b/enrichers/general.py @@ -36,6 +36,9 @@ def _enrich_facility(facility_data: tuple) -> tuple: """enrich a single facility""" facility_id, facility = facility_data facility_name = facility["name"] + if len(facility["source_urls"]) == 1 and "vera-institute/ice-detention-trends" in facility["source_urls"][0]: + logger.debug(" Skipping enrichment of facility with only vera.org data: %s", facility["name"]) + return facility_id, facility 
logger.info("Enriching facility %s...", facility_name) enriched_facility = copy.deepcopy(facility) @@ -43,13 +46,23 @@ def _enrich_facility(facility_data: tuple) -> tuple: wd_res = wikidata.Wikidata(facility_name=facility_name).search() osm = openstreetmap.OpenStreetMap(facility_name=facility_name, address=facility.get("address", {})) osm_res = osm.search() - enriched_facility["wikipedia"]["page_url"] = wiki_res.get("url", "") + url = wiki_res.get("url", None) + if url: + enriched_facility["wikipedia"]["page_url"] = url enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "") - enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "") + url = wd_res.get("url", None) + if url: + enriched_facility["wikidata"]["page_url"] = url enriched_facility["wikidata"]["search_query"] = wd_res.get("search_query_steps", "") - enriched_facility["osm"]["latitude"] = osm_res.get("details", {}).get("latitude", osm.default_coords["latitude"]) - enriched_facility["osm"]["longitude"] = osm_res.get("details", {}).get("longitude", osm.default_coords["longitude"]) - enriched_facility["osm"]["url"] = osm_res.get("url", "") + lat = osm_res.get("details", {}).get("latitude", None) + long = osm_res.get("details", {}).get("longitude", None) + if lat: + enriched_facility["osm"]["latitude"] = lat + if long: + enriched_facility["osm"]["longitude"] = long + url = osm_res.get("url", None) + if url: + enriched_facility["osm"]["url"] = url enriched_facility["osm"]["search_query"] = osm_res.get("search_query_steps", "") logger.debug(enriched_facility) diff --git a/file_utils.py b/file_utils.py index 329f01f..f9a536c 100644 --- a/file_utils.py +++ b/file_utils.py @@ -23,12 +23,14 @@ def export_to_file( match file_type: case "xlsx": with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb: - writer.write_excel(workbook=wb, include_header=True, autofit=True) + _ = writer.write_excel(workbook=wb, include_header=True, autofit=True) case "csv": with 
open(full_name, "w", newline="", encoding="utf-8") as f_out: writer.write_csv(file=f_out, include_header=True) case "parquet": writer.write_parquet(full_name, use_pyarrow=True) + case _: + logger.warning("Invalid dataframe output type %s", file_type) elif file_type == "json": with open(full_name, "w", encoding="utf-8") as f_out: json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str) @@ -103,7 +105,7 @@ def print_summary(facilities_data: dict) -> None: false_positives = 0 errors = 0 for facility in facilities_data["facilities"].values(): - query = facility.get("wikipedia", {}).get("search_query", "") + query: str = facility.get("wikipedia", {}).get("search_query", "") if "REJECTED" in query: false_positives += 1 elif "ERROR" in query: diff --git a/ice_scrapers/__init__.py b/ice_scrapers/__init__.py index 55096a8..76eaa67 100644 --- a/ice_scrapers/__init__.py +++ b/ice_scrapers/__init__.py @@ -4,38 +4,6 @@ may call them """ -# extracted ADP sheet header list 2025-09-07 -facility_sheet_header = [ - "Name", - "Address", - "City", - "State", - "Zip", - "AOR", - "Type Detailed", - "Male/Female", - "FY25 ALOS", - "Level A", - "Level B", - "Level C", - "Level D", - "Male Crim", - "Male Non-Crim", - "Female Crim", - "Female Non-Crim", - "ICE Threat Level 1", - "ICE Threat Level 2", - "ICE Threat Level 3", - "No ICE Threat Level", - "Mandatory", - "Guaranteed Minimum", - "Last Inspection Type", - "Last Inspection End Date", - "Pending FY25 Inspection", - "Last Inspection Standard", - "Last Final Rating", -] - ice_inspection_types = { # found in https://www.ice.gov/foia/odo-facility-inspections "ODO": "Office of Detention Oversight", @@ -43,24 +11,65 @@ "ORSA": "Operational Review Self-Assessment", } +# extracted from https://vera-institute.files.svdcdn.com/production/downloads/dashboard_appendix.pdf 2025-09-23 +ice_facility_group_mapping = { + "Non-Dedicated": ["IGSA"], + "Dedicated": ["DIGSA", "CDF", "SPC"], + "Federal": ["BOF", "USMSIGA", "USMS IGA", 
"USMS CDF", "DOD", "MOC"], + "Hold/Staging": ["Hold", "Staging"], + "Family/Youth": ["Family", "Juvenile", "FAMILY"], + "Medical": ["Hospital"], + "Hotel": ["Hotel"], + "Other/Unknown": ["Other", "Unknown", "Pending"], +} + # extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx 2025-09-07 +# and https://vera-institute.files.svdcdn.com/production/downloads/dashboard_appendix.pdf 2025-09-23 ice_facility_types = { "BOP": { "expanded_name": "Federal Bureau of Prisons", "description": "A facility operated by the Federal Bureau of Prisons", }, + "CDF": { + "expanded_name": "Contract Detention Facility", + "description": "Name derived from listing at https://www.vera.org/ice-detention-trends", + }, "DIGSA": { "expanded_name": "Dedicated Intergovernmental Service Agreement", "description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees – typically these are operated by private contractors pursuant to their agreements with local governments.", }, "DOD": { - "expanded_name": "Department of Defense", - "description": "Military facility", + "expanded_name": "Department of Defense", + "description": "Department of Defense facilities - Often Army bases", + }, + "FAMILY": { + "expanded_name": "Family", + "description": "A facility in which families are able to remain together while awaiting their proceedings", + }, + "Family": { + "expanded_name": "Family", + "description": "A facility in which families are able to remain together while awaiting their proceedings", + }, + "Hospital": { + "expanded_name": "Hospital", + "description": "A medical facility", }, "IGSA": { "expanded_name": "Intergovernmental Service Agreement", "description": "A publicly-owned facility operated by state/local government(s), or 
private contractors, in which ICE contracts for bed space via an Intergovernmental Service Agreement; or local jails used by ICE pursuant to Intergovernmental Service Agreements, which house both ICE and non-ICE detainees, typically county prisoners awaiting trial or serving short sentences, but sometimes also USMS prisoners.", }, + "Juvenile": { + "expanded_name": "Juvenile", + "description": "An IGSA facility capable of housing juveniles (separate from adults) for a temporary period of time", + }, + "Other": { + "expanded_name": "Other", + "description": "Facilities including but not limited to transportation-related facilities, hotels, and/or other facilities", + }, + "Unknown": { + "expanded_name": "Unknown", + "description": "A facility whose type could not be identified", + }, "SPC": { "expanded_name": "Service Processing Center", "description": "A facility owned by the government and staffed by a combination of federal and contract employees.", @@ -82,10 +91,6 @@ "expanded_name": "United States Marshals Service Contract Detention Facility", "description": "Name derived from listing at https://www.vera.org/ice-detention-trends", }, - "CDF": { - "expanded_name": "Contract Detention Facility", - "description": "Name derived from listing at https://www.vera.org/ice-detention-trends", - }, "Staging": { "description": "Some facilities in the ICE spreadsheet are marked 'Staging'. 
Hard to determine why.", "expanded_name": "Staging", @@ -129,6 +134,7 @@ from .utils import ( # noqa: E402 get_ice_scrape_pages, # noqa: F401 repair_locality, # noqa: F401 + repair_name, # noqa: F401 repair_street, # noqa: F401 repair_zip, # noqa: F401 special_facilities, # noqa: F401 @@ -140,5 +146,6 @@ merge_field_offices, # noqa: F401 scrape_field_offices, # noqa: F401 ) +from .vera_data import collect_vera_facility_data # noqa: F401,E402 from .custom_facilities import insert_additional_facilities # noqa: F401,E402 from .general import facilities_scrape_wrapper # noqa: F401,E402 diff --git a/ice_scrapers/custom_facilities.py b/ice_scrapers/custom_facilities.py index 6854830..67b32c9 100644 --- a/ice_scrapers/custom_facilities.py +++ b/ice_scrapers/custom_facilities.py @@ -3,7 +3,7 @@ """ Handle manually discovered/managed facilities """ -custom_facilities = { +custom_facilities: dict = { "2309 North Highway 83,McCook,NE,69001": { "_repaired_record": False, "address": { diff --git a/ice_scrapers/facilities_scraper.py b/ice_scrapers/facilities_scraper.py index 1ee50ae..8c45cdb 100644 --- a/ice_scrapers/facilities_scraper.py +++ b/ice_scrapers/facilities_scraper.py @@ -28,7 +28,7 @@ def scrape_facilities(facilities_data: dict) -> dict: logger.info("Starting to scrape ICE.gov detention facilities...") facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC) urls = get_ice_scrape_pages(base_scrape_url) - + scraped_count = 0 for page_num, url in enumerate(urls): logger.info("Scraping page %s/%s...", page_num + 1, len(urls)) try: @@ -37,6 +37,7 @@ def scrape_facilities(facilities_data: dict) -> dict: logger.error("Error scraping page %s: %s", page_num + 1, e) logger.debug("Found %s facilities on page %s", len(facilities), page_num + 1) time.sleep(1) # Be respectful to the server + scraped_count += len(facilities) for facility in facilities: facility = special_facilities(facility) addr = facility["address"] @@ -52,6 +53,10 @@ def 
scrape_facilities(facilities_data: dict) -> dict: if cleaned: addr["locality"] = locality facility["_repaired_record"] = True + name, cleaned = repair_locality(facility["name"], addr["locality"]) + if cleaned: + facility["name"] = name + facility["_repaired_record"] = True full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper() if not facility["address_str"]: facility["address_str"] = full_address @@ -73,12 +78,12 @@ def scrape_facilities(facilities_data: dict) -> dict: facilities_data["facilities"][facility["name"]] = facility # type: ignore [index] facilities_data["scrape_runtime"] = time.time() - start_time - logger.info("Total facilities scraped: %s", len(list(facilities_data["facilities"].keys()))) # type: ignore [attr-defined] + logger.info("Total facilities scraped: %s", scraped_count) logger.info(" Completed in %s seconds", facilities_data["scrape_runtime"]) return facilities_data -def _scrape_updated(url: str): +def _scrape_updated(url: str) -> datetime.datetime: """ Scrape url to get "last updated" time Is specifically oriented around ice.gov facility pages @@ -92,7 +97,7 @@ def _scrape_updated(url: str): response.raise_for_status() except Exception as e: logger.error(" Error parsing %s: %s", url, e) - return [] + return datetime.datetime.strptime(default_timestamp, timestamp_format) soup = BeautifulSoup(response.content, "html.parser") times = soup.findAll("time") if not times: @@ -176,7 +181,6 @@ def _scrape_page(page_url: str) -> list: facilities.append(facility_data) logger.info(" Extracted %s facilities from page", len(facilities)) - return facilities @@ -189,7 +193,6 @@ def _find_facility_patterns(container): r"([A-Z][^|]+(?:\|[^|]+)?)\s*([A-Z][^A-Z]*Field Office)", r"([^-]+)\s*-\s*([A-Z][^A-Z]*Field Office)", ] - text_content = container.get_text() for pattern in facility_patterns: diff --git a/ice_scrapers/general.py b/ice_scrapers/general.py index 4acd906..b519085 100644 --- a/ice_scrapers/general.py +++ 
b/ice_scrapers/general.py @@ -1,5 +1,6 @@ import copy from ice_scrapers import ( + collect_vera_facility_data, insert_additional_facilities, load_sheet, merge_field_offices, @@ -9,11 +10,13 @@ from schemas import facilities_schema -def facilities_scrape_wrapper() -> dict: +def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False) -> dict: facilities_data = copy.deepcopy(facilities_schema) - facilities = load_sheet() + facilities = load_sheet(keep_sheet, force_download) facilities_data["facilities"] = copy.deepcopy(facilities) facilities_data = scrape_facilities(facilities_data) + if not skip_vera: + facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download) field_offices = scrape_field_offices() facilities_data = merge_field_offices(facilities_data, field_offices) facilities_data = insert_additional_facilities(facilities_data) diff --git a/ice_scrapers/spreadsheet_load.py b/ice_scrapers/spreadsheet_load.py index 9356077..320d742 100644 --- a/ice_scrapers/spreadsheet_load.py +++ b/ice_scrapers/spreadsheet_load.py @@ -1,22 +1,22 @@ from bs4 import BeautifulSoup import copy import datetime -import os -import polars -import re -from schemas import ( - facility_schema, - field_office_schema, -) from ice_scrapers import ( - facility_sheet_header, ice_facility_types, ice_inspection_types, repair_locality, + repair_name, repair_street, repair_zip, special_facilities, ) +import os +import polars +import re +from schemas import ( + facility_schema, + field_office_schema, +) from typing import Tuple from utils import ( logger, @@ -27,8 +27,40 @@ base_xlsx_url = "https://www.ice.gov/detain/detention-management" filename = f"{SCRIPT_DIR}{os.sep}detentionstats.xlsx" +# extracted ADP sheet header list 2025-09-07 +facility_sheet_header = [ + "Name", + "Address", + "City", + "State", + "Zip", + "AOR", + "Type Detailed", + "Male/Female", + "FY25 ALOS", + "Level A", + "Level B", + "Level C", + "Level 
D", + "Male Crim", + "Male Non-Crim", + "Female Crim", + "Female Non-Crim", + "ICE Threat Level 1", + "ICE Threat Level 2", + "ICE Threat Level 3", + "No ICE Threat Level", + "Mandatory", + "Guaranteed Minimum", + "Last Inspection Type", + "Last Inspection End Date", + "Pending FY25 Inspection", + "Last Inspection Standard", + "Last Final Rating", +] -def _download_sheet(keep_sheet: bool = True) -> Tuple[polars.DataFrame, str]: + +def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> Tuple[polars.DataFrame, str]: """Download the detention stats sheet from ice.gov""" resp = session.get(base_xlsx_url, timeout=120) resp.raise_for_status() @@ -50,19 +82,20 @@ def _download_sheet(keep_sheet: bool = True) -> Tuple[polars.DataFrame, str]: actual_link = link["href"] # this seems like tracking into the future... cur_year = year - logger.debug("Found sheet at: %s", actual_link) - logger.info("Downloading detention stats sheet from %s", actual_link) - resp = session.get(actual_link, timeout=120, stream=True) - size = len(resp.content) - with open(filename, "wb") as f: - for chunk in resp.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - logger.debug("Wrote %s byte sheet to %s", size, filename) + if force_download or not os.path.exists(filename): + logger.info("Downloading detention stats sheet from %s", actual_link) + resp = session.get(actual_link, timeout=120, stream=True) + size = len(resp.content) + with open(filename, "wb") as f: + for chunk in resp.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + logger.debug("Wrote %s byte sheet to %s", size, filename) df = polars.read_excel( drop_empty_rows=True, has_header=False, + raise_if_empty=True, # because we're manually defining the header... 
read_options={"skip_rows": 7, "column_names": facility_sheet_header}, sheet_name=f"Facilities FY{cur_year}", @@ -73,8 +106,9 @@ def _download_sheet(keep_sheet: bool = True) -> Tuple[polars.DataFrame, str]: return df, actual_link -def load_sheet(keep_sheet: bool = True) -> dict: - df, sheet_url = _download_sheet(keep_sheet) +def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict: + logger.info("Collecting initial facility data from %s", base_xlsx_url) + df, sheet_url = _download_sheet(keep_sheet, force_download) """Convert the detentionstats sheet data into something we can update our facilities with""" results: dict = {} # occassionally a phone number shows up in weird places in the spreadsheet. @@ -93,13 +127,16 @@ def load_sheet(keep_sheet: bool = True) -> dict: details["phone"] = match.group(1) details["_repaired_record"] = True locality, cleaned = repair_locality(row["City"], row["State"]) + if cleaned: + details["_repaired_record"] = True + name, cleaned = repair_name(row["Name"], row["City"]) if cleaned: details["_repaired_record"] = True details["address"]["administrative_area"] = row["State"] details["address"]["locality"] = locality details["address"]["postal_code"] = zcode details["address"]["street"] = street - details["name"] = row["Name"] + details["name"] = name details = special_facilities(details) full_address = ",".join( [ @@ -157,9 +194,9 @@ def load_sheet(keep_sheet: bool = True) -> dict: "last_rating": row["Last Final Rating"], } details["source_urls"].append(sheet_url) - # details["field_office"] = self.field_offices["field_offices"][area_of_responsibility[row["AOR"]]] details["field_office"] = copy.deepcopy(field_office_schema) details["field_office"]["id"] = row["AOR"] details["address_str"] = full_address results[full_address] = details + logger.info(" Loaded %s facilities", len(results.keys())) return results diff --git a/ice_scrapers/utils.py b/ice_scrapers/utils.py index aa012a0..df7885c 100644 --- 
a/ice_scrapers/utils.py +++ b/ice_scrapers/utils.py @@ -156,6 +156,67 @@ def repair_street(street: str, locality: str = "") -> Tuple[str, bool]: return street, cleaned +def repair_name(name: str, locality: str) -> Tuple[str, bool]: + """Even facility names are occasionally bad""" + matches = [ + {"match": "ALEXANDRIA STAGING FACILI", "replace": "Alexandria Staging Facility", "locality": "ALEXANDRIA"}, + {"match": "ORANGE COUNTY JAIL (NY)", "replace": "ORANGE COUNTY JAIL", "locality": "GOSHEN"}, + {"match": "NORTH LAKE CORRECTIONAL F", "replace": "NORTH LAKE CORRECTIONAL FACILITY", "locality": "BALDWIN"}, + {"match": "PHELPS COUNTY JAIL (MO)", "replace": "Phelps County Jail", "locality": "ROLLA"}, + { + "match": "PENNINGTON COUNTY JAIL (SOUTH DAKOTA)", + "replace": "PENNINGTON COUNTY JAIL", + "locality": "RAPID CITY", + }, + { + "match": "CORR. CTR OF NORTHWEST OHIO", + "replace": "CORRECTIONS CENTER OF NORTHWEST OHIO", + "locality": "STRYKER", + }, + { + "match": "FOLKSTON D RAY ICE PROCES", + "replace": "D. RAY JAMES CORRECTIONAL INSTITUTION", + "locality": "FOLKSTON", + }, + {"match": "COLLIER COUNTY NAPLES JAIL CENTER", "replace": "COLLIER COUNTY JAIL", "locality": "NAPLES"}, + { + "match": "IAH SECURE ADULT DETENTION FACILITY (POLK)", + "replace": "IAM SECURE ADULT DET. 
FACILITY", + "locality": "LIVINGSTON", + }, + {"match": "CIMMARRON CORR FACILITY", "replace": "CIMMARRON CORRECTIONAL FACILITY", "locality": "CUSHING"}, + {"match": "ORANGE COUNTY JAIL (FL)", "replace": "ORANGE COUNTY JAIL", "locality": "ORLANDO"}, + {"match": "CLARK COUNTY JAIL (IN)", "replace": "CLARK COUNTY JAIL", "locality": "JEFFERSONVILLE"}, + {"match": "PRINCE EDWARD COUNTY (FARMVILLE)", "replace": "ICA - FARMVILLE", "locality": "FARMVILLE"}, + {"match": "PHELPS COUNTY JAIL (NE)", "replace": "PHELPS COUNTY JAIL", "locality": "HOLDREGE"}, + { + "match": "WASHINGTON COUNTY JAIL (PURGATORY CORRECTIONAL FAC", + "replace": "WASHINGTON COUNTY JAIL", + "locality": "HURRICANE", + }, + {"match": "ETOWAH COUNTY JAIL (ALABAMA)", "replace": "ETOWAH COUNTY JAIL", "locality": "GADSDEN"}, + {"match": "BURLEIGH COUNTY", "replace": "BURLEIGH COUNTY JAIL", "locality": "BISMARCK"}, + {"match": "NELSON COLEMAN CORRECTION", "replace": "NELSON COLEMAN CORRECTIONS CENTER", "locality": "KILLONA"}, + { + "match": "CIMMARRON CORR FACILITY", + "replace": "CIMARRON CORRECTIONAL FACILITY", + "locality": "CUSHING", + }, + { + "match": "IAM SECURE ADULT DET. FACILITY", + "replace": "IAH SECURE ADULT DET. 
FACILITY", + "locality": "LIVINGSTON", + }, + ] + cleaned = False + for m in matches: + if m["match"] == name and m["locality"] == locality: + name = m["replace"] + cleaned = True + break + return name, cleaned + + def repair_zip(zip_code: int, locality: str) -> Tuple[str, bool]: """ Excel does a cool thing where it strips leading 0s @@ -163,7 +224,8 @@ def repair_zip(zip_code: int, locality: str) -> Tuple[str, bool]: """ zcode = str(zip_code) cleaned = False - if len(zcode) < 5: + # don't replace an empty zip with all 0s + if 0 < len(zcode) < 5: # pad any prefix zeros = "0" * (5 - len(zcode)) zcode = f"{zeros}{zcode}" diff --git a/ice_scrapers/vera_data.py b/ice_scrapers/vera_data.py new file mode 100644 index 0000000..95ad0b2 --- /dev/null +++ b/ice_scrapers/vera_data.py @@ -0,0 +1,308 @@ +import copy +from ice_scrapers import ice_facility_types +import os +import polars +from schemas import facility_schema +from typing import Tuple +from utils import ( + logger, + session, +) + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +# Github can aggressively rate-limit requests, so this may fail in surprising ways! +base_url = ( + "https://raw.githubusercontent.com/vera-institute/ice-detention-trends/refs/heads/main/metadata/facilities.csv" +) +filename = f"{SCRIPT_DIR}{os.sep}vera_facilities.csv" + + +def _vera_name_fixes(name: str, city: str) -> Tuple[str, bool]: + """Match Vera names with ice.gov names""" + matches = [ + {"match": "Adams County", "replace": "Adams County Courthouse", "city": "Ritzville"}, + {"match": "Lemon Creek, Juneau,AK", "replace": "Lemon Creek Correctional Facility", "city": "Juneau"}, + {"match": "Dept Of Corrections-Hagatna", "replace": "Department of Corrections Hagatna", "city": "Hagatna"}, + {"match": "Essex Co. 
Jail, Middleton", "replace": "Essex County Jail", "city": "Middleton"}, + {"match": "Etowah County Jail (AL)", "replace": "Etowah County Jail", "city": "Gadsden"}, + {"match": "Fairfax Co Jail", "replace": "Fairfax County Jail", "city": "Fairfax"}, + { + "match": "Ft Lauderdale Behavor Hlth Ctr", + "replace": "Fort Lauderdale Behavioral Health Center", + "city": "Oakland Park", + }, + {"match": "Marion Correctional Inst.", "replace": "Marion Correctional Institution", "city": "Ocala"}, + {"match": "Florida St. Pris.", "replace": "Florida State Prison", "city": "Raiford"}, + {"match": "Dade Correctional Inst", "replace": "Dade Correctional Institution", "city": "Florida City"}, + {"match": "Franklin County Jail, VT", "replace": "Franklin County Jail", "city": "Saint Albans"}, + {"match": "Frederick County Det. Cen", "replace": "Frederick County Detention Center", "city": "Frederick"}, + {"match": "Freeborn County Jail, MN", "replace": "Freeborn Adult Detention Center", "city": "Albert Lea"}, + {"match": "Fremont County Jail, CO", "replace": "Fremont County Jail", "city": "Canon City"}, + {"match": "Fremont County Jail, WY", "replace": "Fremont County Jail", "city": "Lander"}, + { + "match": "Grand Forks County Correc", + "replace": "Grand Forks County Correctional Facility", + "city": "Grand Forks", + }, + {"match": "Grand Forks Co. Juvenile", "replace": "Grand Forks County Juvenile Facility", "city": "Grand Forks"}, + {"match": "Haile Det. Center", "replace": "Haile Detention Center", "city": "Caldwell"}, + {"match": "Hampden Co.House Of Corr.", "replace": "Hampden County House of Corrections", "city": "Ludlow"}, + {"match": "Eloy Federal Contract Fac", "replace": "Eloy Federal Contract Facility", "city": "Eloy"}, + { + "match": "Henderson County Det. 
Fac.", + "replace": "Henderson County Detention Facility", + "city": "Hendersonville", + }, + {"match": "Hel District Custody", "replace": "Helena District Custody", "city": "Helena"}, + {"match": "Houston Contract Det.Fac.", "replace": "Houston Contract Detention Facility", "city": "Houston"}, + {"match": "Howard County Det Cntr", "replace": "Howard County Detention Center", "city": "Jessup"}, + {"match": "In Dept. Of Corrections", "replace": "Indiana Department of Corrections", "city": "Indianapolis"}, + {"match": "Beth Israel Hospital, Manhattan", "replace": "Beth Israel Hospital Manhattan", "city": "New York"}, + {"match": "Kent Co.,Grand Rapids,MI", "replace": "Kent County Jail", "city": "Grand Rapids"}, + {"match": "Kern County Jail (Lerdo)", "replace": "Kern County Jail", "city": "Bakersfield"}, + {"match": "Lackawana Cnty Jail, PA", "replace": "Lackawana County Jail", "city": "Scranton"}, + {"match": "Las Colinas Women Det Fac", "replace": "Las Colinas Women's Detention Facility", "city": "Santee"}, + {"match": "Lawrence Co. Jail, SD", "replace": "Lawrence County Jail", "city": "Deadwood"}, + {"match": "Lehigh County Jail, PA", "replace": "Lehigh County Jail", "city": "Allentown"}, + {"match": "Macomb Co.Mt.Clemens,MI.", "replace": "Macomb County Jail", "city": "Mount Clemens"}, + {"match": "Bwater St Hosp Bridgewate", "replace": "Bridgewater State Hospital", "city": "Bridgewater"}, + {"match": "Meade Co. Jail, SD", "replace": "Meade County Jail", "city": "Sturgis"}, + {"match": "Mecklenburg (NC) Co Jail", "replace": "Mecklenburg County Jail", "city": "Charlotte"}, + {"match": "Mountrail Co. 
Jail, ND", "replace": "Mountrail County Jail", "city": "Stanley"}, + { + "match": "Saipan Department Of Corrections", + "replace": "SAIPAN DEPARTMENT OF CORRECTIONS (SUSUPE)", + "city": "Saipan", + }, + {"match": "Sitka City Jail, Sitka AK", "replace": "Sitka City Jail", "city": "Sitka"}, + {"match": "Leavenworth USP", "replace": "Leavenworth US Penitentiary", "city": "Leavenworth"}, + {"match": "Limestone County Jail", "replace": "Limestone County Detention Center", "city": "Groesbeck"}, + {"match": "FCI Berlin", "replace": "Berlin Fed. Corr. Inst.", "city": "Berlin"}, + {"match": "Nassau Co Correc Center", "replace": "Nassau County Correctional Center", "city": "East Meadow"}, + {"match": "Riverside Reg Jail", "replace": "Riverside Regional Jail", "city": "Hopewell"}, + {"match": "T Don Hutto Residential Center", "replace": "T Don Hutto Detention Center", "city": "Taylor"}, + {"match": "Desert View", "replace": "Desert View Annex", "city": "Adelanto"}, + {"match": "Alamance Co. Det. Facility", "replace": "Alamance County Detention Facility", "city": "Graham"}, + {"match": "Hall County Sheriff", "replace": "Hall County Department of Corrections", "city": "Grand Island"}, + {"match": "Hall County Sheriff", "replace": "Hall County Department of Corrections", "city": "Grand Island"}, + { + "match": "Dallas County Jail-Lew Sterrett", + "replace": "Dallas County Jail - Lew Sterrett Justice Center", + "city": "Dallas", + }, + {"match": "Hardin Co Jail", "replace": "Hardin County Jail", "city": "Eldora"}, + {"match": "Washington County Jail", "replace": "Washington County Detention Center", "city": "Fayetteville"}, + {"match": "Robert A Deyton Detention Fac", "replace": "Robert A Deyton Detention Facility", "city": "Lovejoy"}, + {"match": "Anchorage Jail", "replace": "Anchorage Correctional Complex", "city": "Anchorage"}, + {"match": "Douglas Co. 
Wisconsin", "replace": "Douglas County", "city": "Superior"}, + { + "match": "Imperial Regional Adult Det Fac", + "replace": "Imperial Regional Detention Facility", + "city": "Calexico", + }, + {"match": "Erie County Jail, PA", "replace": "Erie County Jail", "city": "Erie"}, + {"match": "NW ICE Processing Ctr", "replace": "Northwest ICE Processing Center", "city": "Tacoma"}, + {"match": "Richwood Cor Center", "replace": "Richwood Correctional Center", "city": "Monroe"}, + {"match": "Krome North SPC", "replace": "Krome North Service Processing Center", "city": "Miami"}, + {"match": "Calhoun Co., Battle Cr,MI", "replace": "Calhoun County Correctional Center", "city": "Battle Creek"}, + {"match": "Dodge County Jail, Juneau", "replace": "Dodge County Jail", "city": "Juneau"}, + {"match": "Kandiyohi Co. Jail", "replace": "Kandiyohi County Jail", "city": "Willmar"}, + { + "match": "California City Corrections Center", + "replace": "California City Correctional Center", + "city": "California City", + }, + {"match": "Plymouth Co Cor Facilty", "replace": "Plymouth County Correctional Facility", "city": "Plymouth"}, + {"match": "Otero Co Processing Center", "replace": "Otero County Processing Center", "city": "Chaparral"}, + {"match": "Strafford Co Dept Of Corr", "replace": "Strafford County Corrections", "city": "Dover"}, + {"match": "Madison Co. Jail, MS.", "replace": "Madison County Jail", "city": "Canton"}, + { + "match": "South Texas Fam Residential Center", + "replace": "Dilley Immigration Processing Center", + "city": "Dilley", + }, + {"match": "Tulsa County Jail", "replace": "Tulsa County Jail (David L. Moss Justice Ctr)", "city": "Tulsa"}, + {"match": "Kenton Co Detention Ctr", "replace": "Kenton County Jail", "city": "Covington"}, + {"match": "Pennington County Jail SD", "replace": "Pennington County Jail", "city": "Rapid City"}, + {"match": "Denver Contract Det. 
Fac.", "replace": "Denver Contract Detention Facility", "city": "Aurora"}, + { + "match": "Corrections Center of NW Ohio", + "replace": "Corrections Center of Northwest Ohio", + "city": "Stryker", + }, + {"match": "Grayson County Detention Center", "replace": "Grayson County Jail", "city": "Leitchfield"}, + {"match": "Chippewa Co, SSM", "replace": "Chippewa County SSM", "city": "Sault Sainte Marie"}, + {"match": "Florence SPC", "replace": "Florence Service Processing Center", "city": "Florence"}, + {"match": "D. Ray James Prison", "replace": "D. Ray James Correctional Institution", "city": "Folkston"}, + {"match": "Collier County Sheriff", "replace": "Collier County Jail", "city": "Naples"}, + {"match": "Oldham County Jail", "replace": "Oldham County Detention Center", "city": "La Grange"}, + {"match": "Salt Lake County Jail", "replace": "Salt Lake County Metro Jail", "city": "Salt Lake City"}, + {"match": "Annex Folkston IPC", "replace": "Folkston Annex IPC", "city": "Folkston"}, + { + "match": "Northwest State Correctional Ctr.", + "replace": "Northwest State Correctional Center", + "city": "Swanton", + }, + {"match": "Basile Detention Center", "replace": "South Louisiana ICE Processing Center", "city": "Basile"}, + {"match": "New Hanover Co Det Center", "replace": "New Hanover County Jail", "city": "Castle Hayne"}, + {"match": "Bluebonnet Det Fclty", "replace": "Bluebonnet Detention Facility", "city": "Anson"}, + {"match": "San Luis Regional Det Center", "replace": "San Luis Regional Detention Center", "city": "San Luis"}, + {"match": "Buffalo SPC", "replace": "Buffalo Service Processing Center", "city": "Batavia"}, + {"match": "Laurel County Corrections", "replace": "Laurel County Correctional Center", "city": "London"}, + {"match": "Coastal Bend Det. 
Facility", "replace": "Coastal Bend Detention Facility", "city": "Robstown"}, + {"match": "Winn Corr Institute", "replace": "Winn Correctional Center", "city": "Winnfield"}, + {"match": "Elizabeth Contract D.F.", "replace": "Elizabeth Contract Detention Faciilty", "city": "Elizabeth"}, + { + "match": "Chittenden Reg. Cor. Facility", + "replace": "Chittenden Regional Correctional Facility", + "city": "South Burlington", + }, + { + "match": "NW Regional Corrections Center", + "replace": "Northwest Regional Corrections Center", + "city": "Crookston", + }, + { + "match": "Lasalle ICE Processing Center", + "replace": "Central Louisiana ICE Processing Center (CLIPC)", + "city": "Jena", + }, + { + "match": "La Salle Co Regional Det. Center", + "replace": "La Salle County Regional Detention Center", + "city": "Encinal", + }, + { + "match": "Hancock Co Pub Sfty Cplx", + "replace": "Hancock County Public Safety Complex", + "city": "Bay St. Louis", + }, + {"match": "Brooks County Jail (Contract)", "replace": "Brooks County Jail", "city": "Falfurrias"}, + {"match": "Burleigh Co. Jail, ND", "replace": "Burleigh County Jail", "city": "Bismarck"}, + {"match": "Lubbock County Jail", "replace": "Lubbock County Detention Center", "city": "Lubbock"}, + {"match": "Montgomery County Jail", "replace": "Montgomery Ice Processing Center", "city": "Conroe"}, + {"match": "Sebastian County Det Cnt", "replace": "Sebastian County Detention Center", "city": "Fort Smith"}, + {"match": "Atlanta U.S. Pen.", "replace": "FCI Atlanta", "city": "Atlanta"}, + {"match": "Clinton County Corr. 
Fac.", "replace": "Clinton County Correctional Facility", "city": "Mcelhattan"}, + { + "match": "Freeborn County Jail, MN", + "replace": "Freeborn County Adult Detention Center", + "city": "Albert Lea", + }, + ] + fixed = False + for m in matches: + if m["match"] == name and m["city"] == city: + fixed = True + name = m["replace"] + break + return name, fixed + + +def _vera_city_fixes(city: str, state: str) -> Tuple[str, bool]: + """There are a few cases where getting a state match requires some munging""" + matches = [ + {"match": "Saipan", "replace": "Susupe, Saipan", "city": "MP"}, + {"match": "Sault Sainte Marie", "replace": "Sault Ste Marie", "city": "MP"}, + ] + fixed = False + for m in matches: + if m["match"] == city and m["city"] == state: + fixed = True + city = m["replace"] + break + return city, fixed + + +def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, force_download: bool = True) -> dict: + logger.info("Collecting and extracting data from vera.org facility data...") + if force_download or not os.path.exists(filename): + res = session.get(base_url, timeout=120, stream=True) + res.raise_for_status() + size = len(res.content) + with open(filename, "wb") as f: + for chunk in res.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + logger.debug("Wrote %s byte sheet to %s", size, filename) + df = polars.read_csv(has_header=True, raise_if_empty=True, source=filename, use_pyarrow=True) + if df.is_empty(): + raise ValueError("Empty CSV loaded somehow! %s", df) + # first step to removing duplicates is easy, but unlikely to actually filter anything + df = df.unique() + logger.debug("Extracted data: %s", df) + """ + We retrieve the following columns + detention_facility_code, detention_facility_name, latitude, longitude, city, state, type_detailed, type_grouped + + There are definitely rows that are _essentially_ duplicates, but aren't actually duplicates? 
+ A single facility (based on the latitude/longitudes) will show up with multiple names in this dataset + + None of the data Vera provides on a facility is more accurate than data we already have, so the logic + here should be _purely_ "if not exists, add". + """ + matched_count = 0 + skipped_count = 0 + fixed = 0 + for row in df.iter_rows(named=True): + if not row["state"] or not row["city"]: + logger.warning(" Skipping Vera row with missing values: %s", row) + skipped_count += 1 + continue + found = False + facility_name, fixed_name = _vera_name_fixes(row["detention_facility_name"], row["city"]) + row["name"] = facility_name + city, fixed_city = _vera_city_fixes(row["city"], row["state"]) + row["city"] = city + if fixed_name or fixed_city: + fixed += 1 + if row["name"] == "JTF Camp Six": + row["state"] = "FPO" + row["city"] = "FPO" + row["name"] = "Naval Station Guantanamo Bay (JTF Camp Six and Migrant Ops Center Main A)" + addr_str = f"{row['name']},{row['city']},{row['state']}" + for k, v in facilities_data["facilities"].items(): + if ( + v["name"].upper() == row["name"].upper() + and v["address"]["administrative_area"].upper() == row["state"].upper() + and v["address"]["locality"].upper() == row["city"].upper() + ): + logger.debug(" Found matching facility %s...", v["name"]) + facilities_data["facilities"][k]["osm"]["latitude"] = row["latitude"] + facilities_data["facilities"][k]["osm"]["longitude"] = row["longitude"] + facilities_data["facilities"][k]["vera_id"] = row["detention_facility_code"] + facilities_data["facilities"][k]["source_urls"].append(base_url) + if fixed_name or fixed_city: + facilities_data["facilities"][k]["_repaired_record"] = True + matched_count += 1 + found = True + break + if not found: + facilities_data["facilities"][addr_str] = copy.deepcopy(facility_schema) + facilities_data["facilities"][addr_str]["source_urls"].append(base_url) + facilities_data["facilities"][addr_str]["name"] = row["name"] + 
facilities_data["facilities"][addr_str]["address"]["administrative_area"] = row["state"] + facilities_data["facilities"][addr_str]["address"]["locality"] = row["city"] + facilities_data["facilities"][addr_str]["address_str"] = addr_str + facilities_data["facilities"][addr_str]["osm"]["latitude"] = row["latitude"] + facilities_data["facilities"][addr_str]["osm"]["longitude"] = row["longitude"] + facilities_data["facilities"][addr_str]["facility_type"]["id"] = row["type_detailed"] + facilities_data["facilities"][addr_str]["facility_type"]["group"] = row["type_grouped"] + facilities_data["facilities"][addr_str]["vera_id"] = row["detention_facility_code"] + ft_details = ice_facility_types.get(row["type_detailed"], {}) + if ft_details: + facilities_data["facilities"][addr_str]["facility_type"]["description"] = ft_details["description"] + facilities_data["facilities"][addr_str]["facility_type"]["expanded_name"] = ft_details["expanded_name"] + if fixed_name or fixed_city: + facilities_data["facilities"][addr_str]["_repaired_record"] = True + + logger.info( + " Found %s facilities: Skipped %s, Matched %s, corrected names on %s", + df.height, + skipped_count, + matched_count, + fixed, + ) + if not keep_sheet: + os.unlink(filename) + return facilities_data diff --git a/main.py b/main.py index b00b697..2496839 100644 --- a/main.py +++ b/main.py @@ -92,6 +92,21 @@ def main() -> None: action="store_true", help="Add another column on export for OpenStreetMap debugging details and redirects", ) + parser.add_argument( + "--skip-downloads", + action="store_true", + help="Skip downloading sheet data", + ) + parser.add_argument( + "--delete-sheets", + action="store_true", + help="Remove any sheets we downloaded", + ) + parser.add_argument( + "--skip-vera", + action="store_true", + help="Don't collect vera.org data", + ) args = parser.parse_args() if args.debug: @@ -113,7 +128,11 @@ def main() -> None: exit(1) if args.scrape: - facilities_data = facilities_scrape_wrapper() + 
facilities_data = facilities_scrape_wrapper( + keep_sheet=not args.delete_sheets, + force_download=not args.skip_downloads, + skip_vera=args.skip_vera, + ) elif args.load_existing: facilities_data = copy.deepcopy(default_data.facilities_data) logger.info( diff --git a/schemas.py b/schemas.py index e3fb676..8eae6e9 100644 --- a/schemas.py +++ b/schemas.py @@ -1,7 +1,7 @@ import copy import datetime -facilities_schema = { +facilities_schema: dict = { "enrich_runtime": 0, "facilities": {}, "scrape_runtime": 0, @@ -94,6 +94,7 @@ "total": 0, }, "source_urls": [], + "vera_id": "", "wikipedia": { "page_url": "", "search_query": "", @@ -106,7 +107,7 @@ # enrichment response object -enrich_resp_schema = { +enrich_resp_schema: dict = { "cleaned_name": "", "details": {}, "enrichment_type": "", diff --git a/tools/find_missing_vera.py b/tools/find_missing_vera.py new file mode 100644 index 0000000..773e313 --- /dev/null +++ b/tools/find_missing_vera.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +import json +import os +import pprint +import subprocess + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + + +def _find_files(directory: str) -> os.DirEntry[str]: + results = [] + with os.scandir(directory) as d: + for f in d: + if f.name.startswith("ice_detention_facilities") and f.name.endswith(".json"): + results.append(f) + final = results[0] + for f in results: + if f.stat().st_mtime > final.stat().st_mtime: + final = f + return final + + +def main() -> None: + res = subprocess.run(["git", "rev-parse", "--show-toplevel"], capture_output=True, check=True) + root_dir = res.stdout.decode("utf-8").splitlines()[0] + newest_file = _find_files(root_dir) + with open(newest_file.path, "r") as f_in: + data = json.load(f_in) + missing_vera = {k: v for k, v in data["facilities"].items() if not v.get("vera_id", "")} + pprint.pprint(missing_vera, indent=1, compact=True) + print(f"Found {len(missing_vera.keys())} facilities with a missing vera.org ID") + + +if __name__ == "__main__":
+ main()