Merged
20 changes: 15 additions & 5 deletions enrichers/general.py
@@ -43,13 +43,23 @@ def _enrich_facility(facility_data: tuple) -> tuple:
wd_res = wikidata.Wikidata(facility_name=facility_name).search()
osm = openstreetmap.OpenStreetMap(facility_name=facility_name, address=facility.get("address", {}))
osm_res = osm.search()
enriched_facility["wikipedia"]["page_url"] = wiki_res.get("url", "")
url = wiki_res.get("url", None)
if url:
enriched_facility["wikipedia"]["page_url"] = url
enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "")
enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "")
url = wd_res.get("url", None)
if url:
enriched_facility["wikidata"]["page_url"] = url
enriched_facility["wikidata"]["search_query"] = wd_res.get("search_query_steps", "")
enriched_facility["osm"]["latitude"] = osm_res.get("details", {}).get("latitude", osm.default_coords["latitude"])
enriched_facility["osm"]["longitude"] = osm_res.get("details", {}).get("longitude", osm.default_coords["longitude"])
enriched_facility["osm"]["url"] = osm_res.get("url", "")
lat = osm_res.get("details", {}).get("latitude", None)
long = osm_res.get("details", {}).get("longitude", None)
if lat:
enriched_facility["osm"]["latitude"] = lat
if long:
enriched_facility["osm"]["longitude"] = lat
url = osm_res.get("url", None)
if url:
enriched_facility["osm"]["url"] = url
enriched_facility["osm"]["search_query"] = osm_res.get("search_query_steps", "")

logger.debug(enriched_facility)
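
The same fetch-then-guard pattern now repeats for the Wikipedia, Wikidata, and OSM fields. A minimal sketch of a shared helper that could collapse it, using only the dicts shown above (the _set_if_present name is ours, not the PR's); note it tests `is not None` rather than truthiness, since a plain `if value:` would also skip valid falsy values such as a 0.0 latitude:

def _set_if_present(target: dict, key: str, value) -> None:
    # Only overwrite the schema default when the lookup returned something.
    # `is not None` (rather than truthiness) keeps falsy-but-valid values
    # like a 0.0 coordinate.
    if value is not None:
        target[key] = value

# e.g. _set_if_present(enriched_facility["osm"], "latitude",
#                      osm_res.get("details", {}).get("latitude"))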
6 changes: 4 additions & 2 deletions file_utils.py
@@ -23,12 +23,14 @@ def export_to_file(
match file_type:
case "xlsx":
with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
writer.write_excel(workbook=wb, include_header=True, autofit=True)
_ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
case "csv":
with open(full_name, "w", newline="", encoding="utf-8") as f_out:
writer.write_csv(file=f_out, include_header=True)
case "parquet":
writer.write_parquet(full_name, use_pyarrow=True)
case _:
logger.warning("Invalid dataframe output type %s", file_type)
elif file_type == "json":
with open(full_name, "w", encoding="utf-8") as f_out:
json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)
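
The match/case dispatch above logs a warning for unrecognized dataframe output types instead of raising. A self-contained sketch of that pattern, with illustrative names rather than the repo's (the real writer calls are shown in the diff):

import logging

logger = logging.getLogger(__name__)

def dispatch_export(file_type: str) -> str:
    # Known types map to a writer; anything else is logged, not raised.
    match file_type:
        case "xlsx" | "csv" | "parquet":
            return f"writing {file_type}"
        case _:
            logger.warning("Invalid dataframe output type %s", file_type)
            return "skipped"

dispatch_export("parquet")  # "writing parquet"
dispatch_export("toml")     # logs the warning, returns "skipped"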
@@ -103,7 +105,7 @@ def print_summary(facilities_data: dict) -> None:
false_positives = 0
errors = 0
for facility in facilities_data["facilities"].values():
query = facility.get("wikipedia", {}).get("search_query", "")
query: str = facility.get("wikipedia", {}).get("search_query", "")
if "REJECTED" in query:
false_positives += 1
elif "ERROR" in query:
79 changes: 42 additions & 37 deletions ice_scrapers/__init__.py
@@ -4,67 +4,75 @@
may call them
"""

# extracted ADP sheet header list 2025-09-07
facility_sheet_header = [
"Name",
"Address",
"City",
"State",
"Zip",
"AOR",
"Type Detailed",
"Male/Female",
"FY25 ALOS",
"Level A",
"Level B",
"Level C",
"Level D",
"Male Crim",
"Male Non-Crim",
"Female Crim",
"Female Non-Crim",
"ICE Threat Level 1",
"ICE Threat Level 2",
"ICE Threat Level 3",
"No ICE Threat Level",
"Mandatory",
"Guaranteed Minimum",
"Last Inspection Type",
"Last Inspection End Date",
"Pending FY25 Inspection",
"Last Inspection Standard",
"Last Final Rating",
]
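
A hedged sketch of how this header list could validate a downloaded ADP sheet; the use of polars and the local filename are assumptions (the PR's dataframe writer and the xlsx URL cited below suggest both):

import polars as pl

# Hypothetical validation pass; the repo's actual loader is load_sheet().
df = pl.read_excel("FY25_detentionStats08292025.xlsx")  # assumed local copy
missing = [col for col in facility_sheet_header if col not in df.columns]
if missing:
    raise ValueError(f"ADP sheet missing expected columns: {missing}")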

ice_inspection_types = {
# found in https://www.ice.gov/foia/odo-facility-inspections
"ODO": "Office of Detention Oversight",
# found in https://ia803100.us.archive.org/16/items/6213032-ORSA-MOU-ICE/6213032-ORSA-MOU-ICE_text.pdf
"ORSA": "Operational Review Self-Assessment",
}

# extracted from https://vera-institute.files.svdcdn.com/production/downloads/dashboard_appendix.pdf 2025-09-23
ice_facility_group_mapping = {
"Non-Dedicated": ["IGSA"],
"Dedicated": ["DIGSA", "CDF", "SPC"],
"Federal": ["BOF", "USMSIGA", "USMS IGA", "USMS CDF", "DOD", "MOC"],
"Hold/Staging": ["Hold", "Staging"],
"Family/Youth": ["Family", "Juvenile", "FAMILY"],
"Medical": ["Hospital"],
"Hotel": ["Hotel"],
"Other/Unknown": ["Other", "Unknown", "Pending"],
}
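
Since each raw spreadsheet type appears in exactly one group, the mapping inverts cleanly into a lookup table; a small sketch using only the dict above (the ice_type_to_group name is ours):

# Reverse index: raw ICE type -> Vera group label.
ice_type_to_group = {
    raw_type: group
    for group, raw_types in ice_facility_group_mapping.items()
    for raw_type in raw_types
}

assert ice_type_to_group["SPC"] == "Dedicated"
assert ice_type_to_group["Hold"] == "Hold/Staging"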

# extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx 2025-09-07
ice_facility_types = {
"BOP": {
"expanded_name": "Federal Bureau of Prisons",
"description": "A facility operated by the Federal Bureau of Prisons",
},
"CDF": {
"expanded_name": "Contract Detention Facility",
"description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
},
"DIGSA": {
"expanded_name": "Dedicated Intergovernmental Service Agreement",
"description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees – typically these are operated by private contractors pursuant to their agreements with local governments.",
},
"DOD": {
"expanded_name": "Department of Defense",
"description": "Military facility",
"description": "Department of Defence facilities - Often Army bases",
},
"FAMILY": {
"expanded_name": "Family",
"description": "A facility in which families are able to remain together while awaiting their proceedings",
},
"Family": {
"expanded_name": "Family",
"description": "A facility in which families are able to remain together while awaiting their proceedings",
},
"Hospital": {
"expanded_name": "Hospital",
"description": "A medical facility",
},
"IGSA": {
"expanded_name": "Intergovernmental Service Agreement",
"description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts for bed space via an Intergovernmental Service Agreement; or local jails used by ICE pursuant to Intergovernmental Service Agreements, which house both ICE and non-ICE detainees, typically county prisoners awaiting trial or serving short sentences, but sometimes also USMS prisoners.",
},
"Juvenile": {
"expanded_name": "Juvenile",
"description": "An IGSA facility capable of housing juveniles (separate from adults) for a temporary period of time",
},
"Other": {
"expanded_name": "Other",
"description": "Facilities including but not limited to transportation-related facilities, hotels, and/or other facilities",
},
"SPC": {
"expanded_name": "Service Processing Center",
"description": "A facility owned by the government and staffed by a combination of federal and contract employees.",
},
"Unknown": {
"expanded_name": "Unknown",
"description": "A facility whose type could not be identified",
},
"USMS": {
"expanded_name": "United States Marshals Service",
"description": "A facility primarily contracted with the USMS for housing of USMS detainees, in which ICE contracts with the USMS for bed space.",
@@ -82,10 +90,6 @@
"expanded_name": "United States Marshals Service Contract Detention Facility",
"description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
},
"CDF": {
"expanded_name": "Contract Detention Facility",
"description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
},
"Staging": {
"description": "Some facilities in the ICE spreadsheet are marked 'Staging'. Hard to determine why.",
"expanded_name": "Staging",
@@ -131,6 +135,7 @@
repair_locality, # noqa: F401
repair_street, # noqa: F401
repair_zip, # noqa: F401
repair_name, # noqa: F401
special_facilities, # noqa: F401
update_facility, # noqa: F401
)
2 changes: 1 addition & 1 deletion ice_scrapers/custom_facilities.py
@@ -3,7 +3,7 @@
"""
Handle manually discovered/managed facilities
"""
custom_facilities = {
custom_facilities: dict = {
"2309 North Highway 83,McCook,NE,69001": {
"_repaired_record": False,
"address": {
14 changes: 10 additions & 4 deletions ice_scrapers/facilities_scraper.py
@@ -7,6 +7,7 @@
repair_locality,
repair_street,
repair_zip,
repair_name,
special_facilities,
update_facility,
)
@@ -29,6 +30,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
urls = get_ice_scrape_pages(base_scrape_url)

scraped_count = 0
for page_num, url in enumerate(urls):
logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
try:
@@ -37,6 +39,7 @@
logger.error("Error scraping page %s: %s", page_num + 1, e)
logger.debug("Found %s facilities on page %s", len(facilities), page_num + 1)
time.sleep(1) # Be respectful to the server
scraped_count += len(facilities)
for facility in facilities:
facility = special_facilities(facility)
addr = facility["address"]
@@ -52,6 +55,10 @@
if cleaned:
addr["locality"] = locality
facility["_repaired_record"] = True
name, cleaned = repair_name(facility["name"], addr["locality"])
if cleaned:
facility["name"] = name
facility["_repaired_record"] = True
full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper()
if not facility["address_str"]:
facility["address_str"] = full_address
@@ -73,12 +80,12 @@
facilities_data["facilities"][facility["name"]] = facility # type: ignore [index]

facilities_data["scrape_runtime"] = time.time() - start_time
logger.info("Total facilities scraped: %s", len(list(facilities_data["facilities"].keys()))) # type: ignore [attr-defined]
logger.info("Total facilities scraped: %s", scraped_count)
logger.info(" Completed in %s seconds", facilities_data["scrape_runtime"])
return facilities_data


def _scrape_updated(url: str):
def _scrape_updated(url: str) -> datetime.datetime:
"""
Scrape url to get "last updated" time
Is specifically oriented around ice.gov facility pages
@@ -92,7 +99,7 @@
response.raise_for_status()
except Exception as e:
logger.error(" Error parsing %s: %s", url, e)
return []
return datetime.datetime.strptime(default_timestamp, timestamp_format)
soup = BeautifulSoup(response.content, "html.parser")
times = soup.findAll("time")
if not times:
@@ -176,7 +183,6 @@ def _scrape_page(page_url: str) -> list:
facilities.append(facility_data)

logger.info(" Extracted %s facilities from page", len(facilities))

return facilities


4 changes: 2 additions & 2 deletions ice_scrapers/general.py
@@ -9,9 +9,9 @@
from schemas import facilities_schema


def facilities_scrape_wrapper() -> dict:
def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True) -> dict:
facilities_data = copy.deepcopy(facilities_schema)
facilities = load_sheet()
facilities = load_sheet(keep_sheet, force_download)
facilities_data["facilities"] = copy.deepcopy(facilities)
facilities_data = scrape_facilities(facilities_data)
field_offices = scrape_field_offices()
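
The new keyword arguments make sheet handling configurable at the call site; their exact semantics are inferred from the names, so treat this usage sketch as an assumption:

facilities_data = facilities_scrape_wrapper()  # defaults: keep and re-download the sheet
facilities_data = facilities_scrape_wrapper(keep_sheet=False, force_download=False)  # presumably reuse a cached sheet and discard it after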