
Commit 7fe5a97

Merge pull request #48 from johnseekins/small-improvements
Add custom facilities and some small schema improvements
2 parents: fb4dbf1 + a5c2363

File tree

11 files changed, +221 −120 lines


README.md

Lines changed: 0 additions & 3 deletions
@@ -18,9 +18,6 @@ scrape of the data from ICE.gov. This is stored in `default_data.py` and include
 
 > Note ICE has been renaming known "detention center" sites to "processing center", and so on.
 
-The initial scrape data also keeps a `base64` ecoded string containing the original HTML that was scraped from ice.gov about the
-facility. Keeping this initial data allows us to verify the resulting extracted data if we need to.
-
 It also shows the ICE "field office" managing each detention facility.
 
 On the OpenStreetMap (OSM) CSV results, if the URL includes a "way" then it has probably identified the correctly tagged

default_data.py

Lines changed: 41 additions & 40 deletions
Large diffs are not rendered by default.

ice_scrapers/README.md

Lines changed: 4 additions & 0 deletions
@@ -36,3 +36,7 @@ and contact info for the field office.
 > The field-offices page shows information about a number of different offices. As we
 > are largely focused on detention, ERO (Enforcement and Removal Operations) centers
 > are the most interesting.
+
+## custom_facilities.py
+
+Some facilities we may discover manually. Or they may be "pending" classification, but we discover them early on. These facilities are defined here.
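
As a hedged illustration of what such a registration might look like (the address and name below are made up; real entries populate the full facility_schema shape shown in custom_facilities.py later in this diff):

from ice_scrapers.custom_facilities import custom_facilities

# Hypothetical, abbreviated entry; real entries also carry address,
# field_office, population, osm, and the other facility_schema fields.
custom_facilities["123 Example Rd,Somewhere,TX,00000"] = {
    "_repaired_record": False,
    "name": "Example Facility",
    "facility_type": {"id": "Pending"},  # resolved against ice_facility_types
}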

ice_scrapers/__init__.py

Lines changed: 12 additions & 2 deletions
@@ -82,6 +82,14 @@
         "expanded_name": "Contract Detention Facility",
         "description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
     },
+    "Staging": {
+        "description": "Some facilities in the ICE spreadsheet are marked 'Staging'. Hard to determine why.",
+        "expanded_name": "Staging",
+    },
+    "Pending": {
+        "expanded_name": "Pending Classication and Inclusion",
+        "description": "Facilities discovered through other means that may become ICE/DHS facilities",
+    },
 }
 
 # ICE AOR mappings
@@ -115,10 +123,10 @@
 field_office_to_aor = {v: k for k, v in area_of_responsibility.items()}
 
 from .utils import (  # noqa: E402
-    clean_street,  # noqa: F401
     get_ice_scrape_pages,  # noqa: F401
-    repair_zip,  # noqa: F401
     repair_locality,  # noqa: F401
+    repair_street,  # noqa: F401
+    repair_zip,  # noqa: F401
     update_facility,  # noqa: F401
 )
 from .facilities_scraper import scrape_facilities  # noqa: F401,E402
@@ -127,3 +135,5 @@
     merge_field_offices,  # noqa: F401
     scrape_field_offices,  # noqa: F401
 )
+from .custom_facilities import insert_additional_facilities  # noqa: F401,E402
+from .general import facilities_scrape_wrapper  # noqa: F401,E402
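
The "Staging" and "Pending" entries extend what appears to be the ice_facility_types mapping; the lookup shape matches the one used in ice_scrapers/spreadsheet_load.py further down. A hedged lookup sketch:

from ice_scrapers import ice_facility_types

# Resolve one of the codes added in this commit, the same way
# load_sheet() resolves row["Type Detailed"].
ft_details = ice_facility_types.get("Pending", {})
if ft_details:
    print(ft_details["expanded_name"])  # "Pending Classication and Inclusion" (sic)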

ice_scrapers/custom_facilities.py

Lines changed: 85 additions & 0 deletions
import copy

"""
Handle manually discovered/managed facilities
"""
custom_facilities = {
    "North Highway 83,McCook,NE,69001": {
        "_repaired_record": False,
        "address": {
            "administrative_area": "NE",
            "country": "United States",
            "locality": "McCook",
            "postal_code": "69001",
            "street": "North Highway 83",
        },
        "address_str": "North Highway 83,McCook,NE,69001",
        "facility_type": {
            "expanded_name": "Pending Classication and Inclusion",
            "description": "Facilities discovered through other means that may become ICE/DHS facilities",
            "id": "Pending",
        },
        "field_office": {
            "address": {
                "administrative_area": "MN",
                "country": "United States",
                "locality": "Fort Snelling",
                "postal_code": "55111",
                "street": "1 Federal Drive Suite 1601",
            },
            "address_str": "1 Federal Drive Suite 1601 Fort Snelling, MN 55111",
            "aor": "Iowa, Minnesota, Nebraska, North Dakota, South Dakota",
            "email": "[email protected]",
            "field_office": "St Paul Field Office",
            "id": "SPM",
            "name": "Fort Snelling - ERO",
            "phone": "(612) 843-8600",
            "source_urls": ["https://www.ice.gov/contact/field-offices?page=2&exposed_form_display=1"],
        },
        "image_url": "https://corrections.nebraska.gov/sites/default/files/2024-08/wec_thumbnail.jpg",
        "inspection": {
            "last_date": None,
            "last_rating": "",
            "last_type": "",
        },
        "name": "Work Ethic Camp",
        "osm": {
            "latitude": 40.22851,
            "longitude": -100.548001,
            "search_query": "",
            "url": "https://www.openstreetmap.org/way/456014773#map=19/40.228251/-100.648001",
        },
        "page_updated_date": None,
        "phone": "308-345-8405",
        "population": {
            "female": {"allowed": False, "criminal": 0, "non_criminal": 0},
            "avg_stay_length": 0,
            "ice_threat_level": {
                "level_1": 0,
                "level_2": 0,
                "level_3": 0,
                "none": 0,
            },
            "male": {"allowed": False, "criminal": 0, "non_criminal": 0},
            "housing": {"guaranteed_min": 0, "mandatory": 0},
            "security_threat": {
                "high": 0,
                "low": 0,
                "medium_high": 0,
                "medium_low": 0,
            },
            "total": 0,
        },
        "source_urls": [
            "https://corrections.nebraska.gov/facilities/work-ethic-camp",
        ],
        "wikidata": {"page_url": "", "search_query": ""},
        "wikipedia": {"page_url": "", "search_query": ""},
    },
}


def insert_additional_facilities(facilities_data: dict) -> dict:
    for facility_id, facility in custom_facilities.items():
        facilities_data["facilities"][facility_id] = copy.deepcopy(facility)
    return facilities_data
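
A minimal usage sketch for the helper (hypothetical driver code; in this commit the real caller is facilities_scrape_wrapper in ice_scrapers/general.py):

from ice_scrapers import insert_additional_facilities

# Start from an otherwise empty dataset; the real pipeline passes in
# the fully scraped facilities_data.
facilities_data = {"facilities": {}}
facilities_data = insert_additional_facilities(facilities_data)
print(len(facilities_data["facilities"]))  # 1: the Work Ethic Camp entry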

ice_scrapers/facilities_scraper.py

Lines changed: 3 additions & 3 deletions
@@ -3,10 +3,10 @@
 import datetime
 import re
 from ice_scrapers import (
-    clean_street,
     get_ice_scrape_pages,
-    repair_zip,
     repair_locality,
+    repair_street,
+    repair_zip,
     update_facility,
 )
 from schemas import facility_schema
@@ -38,7 +38,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
         time.sleep(1)  # Be respectful to the server
         for facility in facilities:
             addr = facility["address"]
-            street, cleaned = clean_street(addr["street"], addr["locality"])
+            street, cleaned = repair_street(addr["street"], addr["locality"])
             if cleaned:
                 addr["street"] = street
                 facility["_repaired_record"] = True

ice_scrapers/general.py

Lines changed: 21 additions & 0 deletions
import copy
from ice_scrapers import (
    insert_additional_facilities,
    load_sheet,
    merge_field_offices,
    scrape_facilities,
    scrape_field_offices,
)
from schemas import facilities_schema


def facilities_scrape_wrapper() -> dict:
    facilities_data = copy.deepcopy(facilities_schema)
    facilities = load_sheet()
    facilities_data["facilities"] = copy.deepcopy(facilities)
    facilities_data = scrape_facilities(facilities_data)
    field_offices = scrape_field_offices()
    facilities_data = merge_field_offices(facilities_data, field_offices)
    facilities_data = insert_additional_facilities(facilities_data)

    return facilities_data
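
With the wrapper in place, a caller needs one import and one call (this mirrors the simplified main.py below; live network access to ice.gov is assumed since the scrapers hit it directly):

from ice_scrapers import facilities_scrape_wrapper

# Runs the whole pipeline: spreadsheet load, facility scrape,
# field-office scrape and merge, then custom-facility insertion.
facilities_data = facilities_scrape_wrapper()
print(f"{len(facilities_data['facilities'])} facilities collected")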

ice_scrapers/spreadsheet_load.py

Lines changed: 10 additions & 13 deletions
@@ -9,12 +9,12 @@
     field_office_schema,
 )
 from ice_scrapers import (
-    clean_street,
     facility_sheet_header,
     ice_facility_types,
     ice_inspection_types,
-    repair_zip,
     repair_locality,
+    repair_street,
+    repair_zip,
 )
 from typing import Tuple
 from utils import (
@@ -84,7 +84,7 @@ def load_sheet(keep_sheet: bool = True) -> dict:
         zcode, cleaned = repair_zip(row["Zip"], row["City"])
         if cleaned:
             details["_repaired_record"] = True
-        street, cleaned = clean_street(row["Address"], row["City"])
+        street, cleaned = repair_street(row["Address"], row["City"])
         if cleaned:
             details["_repaired_record"] = True
         match = phone_re.search(row["Address"])
@@ -101,7 +101,9 @@ def load_sheet(keep_sheet: bool = True) -> dict:
         details["address"]["street"] = street
         details["name"] = row["Name"]
 
-        # population statistics
+        """
+        population statistics
+        """
         details["population"]["male"]["criminal"] = row["Male Crim"]
         details["population"]["male"]["non_criminal"] = row["Male Non-Crim"]
         details["population"]["female"]["criminal"] = row["Female Crim"]
@@ -123,27 +125,22 @@ def load_sheet(keep_sheet: bool = True) -> dict:
             "level_3": row["ICE Threat Level 3"],
             "none": row["No ICE Threat Level"],
         }
-        """
-        # extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx 2025-09-22
-        Upon admission and periodically thereafter, detainees are categorized into a security level based on a variety of public safety factors, and are housed accordingly. Factors include prior convictions, threat risk, disciplinary record, special vulnerabilities, and special management concerns. Detainees are categorized into one of four classes of security risk: A/low, B/medium low, C/medium high, and D/high.
-        """
+        # Levels extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats09112025.xlsx 2025-09-22
         details["population"]["security_threat"]["low"] = row["Level A"]
         details["population"]["security_threat"]["medium_low"] = row["Level B"]
         details["population"]["security_threat"]["medium_high"] = row["Level C"]
         details["population"]["security_threat"]["high"] = row["Level D"]
+        details["population"]["housing"]["mandatory"] = row["Mandatory"]
+        details["population"]["housing"]["guaranteed_min"] = row["Guaranteed Minimum"]
+        details["population"]["avg_stay_length"] = row["FY25 ALOS"]
 
         details["facility_type"] = {
             "id": row["Type Detailed"],
-            "housing": {
-                "mandatory": row["Mandatory"],
-                "guaranteed_min": row["Guaranteed Minimum"],
-            },
         }
         ft_details = ice_facility_types.get(row["Type Detailed"], {})
         if ft_details:
             details["facility_type"]["description"] = ft_details["description"]
             details["facility_type"]["expanded_name"] = ft_details["expanded_name"]
-        details["avg_stay_length"] = row["FY25 ALOS"]
         details["inspection"] = {
             # fall back to type code
             "last_type": ice_inspection_types.get(row["Last Inspection Type"], row["Last Inspection Type"]),

ice_scrapers/utils.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 )
 
 
-def clean_street(street: str, locality: str = "") -> Tuple[str, bool]:
+def repair_street(street: str, locality: str = "") -> Tuple[str, bool]:
     """Generally, we'll let the spreadsheet win arguments just to be consistent"""
     street_filters = [
         # address mismatch between site and spreadsheet
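
Only the name changes here (clean_street becomes repair_street, matching the repair_* naming of its sibling helpers); signature and behavior are untouched. A hedged call sketch with made-up inputs:

from ice_scrapers import repair_street

# The returned bool reports whether any street filter fired.
street, cleaned = repair_street("100 North Highway 83", locality="McCook")
if cleaned:
    print(f"street repaired to {street!r}")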

main.py

Lines changed: 3 additions & 17 deletions
@@ -23,17 +23,9 @@
 import logging
 from file_utils import export_to_file, print_summary
 import default_data
-from ice_scrapers import (
-    load_sheet,
-    merge_field_offices,
-    scrape_facilities,
-    scrape_field_offices,
-)
+from ice_scrapers import facilities_scrape_wrapper
 from enrichers import enrich_facility_data
-from schemas import (
-    facilities_schema,
-    supported_output_types,
-)
+from schemas import supported_output_types
 from utils import logger
 # CLI, argument parsing, script orchestration
 
@@ -116,18 +108,12 @@ def main() -> None:
         logger.warning(
             "Warning: --debug-wikipedia, --debug-wikidata and --debug-osm are currently not implemented as command line options."
         )
-    facilities_data = copy.deepcopy(facilities_schema)
-
     if args.scrape and args.load_existing:
         logger.error("Can't scrape and load existing data!")
         exit(1)
 
     if args.scrape:
-        facilities = load_sheet()
-        facilities_data["facilities"] = copy.deepcopy(facilities)
-        facilities_data = scrape_facilities(facilities_data)
-        field_offices = scrape_field_offices()
-        facilities_data = merge_field_offices(facilities_data, field_offices)
+        facilities_data = facilities_scrape_wrapper()
     elif args.load_existing:
         facilities_data = copy.deepcopy(default_data.facilities_data)
         logger.info(
