
Commit f3167a5

Merge pull request #32 from johnseekins/additional-field-offices: collect and match all field offices

2 parents 1f8186d + 58f9cc9

File tree

7 files changed: +610 -47 lines

default_data.py

Lines changed: 340 additions & 20 deletions
Large diffs are not rendered by default.

field_offices.py

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
+# ICEFieldOfficeScraper class and scraping-related code
+import base64
+from bs4 import BeautifulSoup
+import copy
+import datetime
+import re
+from schemas import (
+    field_offices_schema,
+    field_office_schema,
+)
+import time
+from utils import (
+    logger,
+    session,
+)
+
+
+class ICEFieldOfficeScraper(object):
+    base_scrape_url = "https://www.ice.gov/contact/field-offices"
+
+    def __init__(self):
+        self.office_data = copy.deepcopy(field_offices_schema)
+
+    def _get_scrape_pages(self) -> list:
+        """Discover all facility pages"""
+        resp = session.get(self.base_scrape_url, timeout=30)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.content, "html.parser")
+        links = soup.findAll("a", href=re.compile(r"\?page="))
+        if not links:
+            raise Exception(f"{self.base_scrape_url} contains *no* links?!")
+        pages = [
+            f"{self.base_scrape_url}{link['href']}&exposed_form_display=1"
+            for link in links
+            if not any(k in link["aria-label"] for k in ["Next", "Last"])
+        ]
+        logger.debug("Pages discovered: %s", pages)
+        return pages
+
+    def scrape_field_offices(self) -> dict:
+        """Collect data on ICE field offices"""
+        start_time = time.time()
+        self.office_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
+        logger.info("Starting to scrape ICE.gov field offices...")
+        urls = self._get_scrape_pages()
+        for page_num, url in enumerate(urls):
+            logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
+            offices = self._scrape_page(url)
+            logger.debug("Found %s offices on page %s", len(offices), page_num + 1)
+            time.sleep(1)  # Be respectful to the server
+            for office in offices:
+                self.office_data["field_offices"][office["field_office"]] = office
+        self.office_data["scrape_runtime"] = time.time() - start_time
+        logger.info("Total field offices scraped: %s", len(self.office_data["field_offices"]))
+        logger.info("  Completed in %s seconds", self.office_data["scrape_runtime"])
+        return self.office_data
+
+    def _scrape_page(self, page_url: str) -> list:
+        """Scrape a single page of facilities using BeautifulSoup"""
+        logger.debug("  Fetching: %s", page_url)
+        try:
+            response = session.get(page_url, timeout=30)
+            response.raise_for_status()
+        except Exception as e:
+            logger.error("  Error fetching %s: %s", page_url, e)
+            return []
+        # Parse HTML with BeautifulSoup
+        soup = BeautifulSoup(response.content, "html.parser")
+        offices = []
+
+        # Look for the main content area - ICE uses different possible containers
+        content_selectors = [
+            "div.view-content",  # Primary content container
+            "div.views-rows",  # Alternative container
+            "ul.views-rows",  # List-based container
+            "div.region-content",  # Region content
+            "main",  # HTML5 main element
+            "div.content",  # Generic content
+        ]
+        content_container = None
+        logger.debug("Searching %s for content", page_url)
+        for selector in content_selectors:
+            content_container = soup.select_one(selector)
+            if content_container:
+                logger.debug("  Found content using selector: %s", selector)
+                break
+
+        if not content_container:
+            logger.warning("  Warning: Could not find content container, searching entire page")
+            content_container = soup
+
+        # Look for facility entries - try multiple patterns
+        office_selectors = [
+            "li.grid",  # List items with grid class
+            "div.views-row",  # View rows
+            "li.views-row",  # List-based view rows
+            "div.facility-item",  # Custom facility items
+            "article",  # Article elements
+            "div.node",  # Drupal node containers
+        ]
+        office_elements: list = []
+        for selector in office_selectors:
+            elements = content_container.select(selector)
+            if elements:
+                office_elements = elements
+                logger.debug(
+                    "  Found %s office elements using selector: %s",
+                    len(elements),
+                    selector,
+                )
+                break
+
+        # if not office_elements:
+        #     # Fallback: look for any element containing office-like text patterns
+        #     logger.warning("  Using fallback: searching for office patterns in text")
+        #     office_elements = self._find_office_patterns(content_container)
+
+        # Extract data from each office element
+        for element in office_elements:
+            office_data = self._extract_single_office(element, page_url)
+            if office_data and office_data.get("name", None):
+                offices.append(office_data)
+        logger.info("  Extracted %s field offices from page", len(offices))
+
+        return offices
+
+    def _extract_single_office(self, element: BeautifulSoup, page_url: str) -> dict:
+        """Extract data from a single office element"""
+        office = copy.deepcopy(field_office_schema)
+        raw_scrape = str(element)
+        office["raw_scrape"] = base64.b64encode(raw_scrape.encode("utf-8")).decode("utf-8")
+        office["source_urls"].append(page_url)
+        logger.debug("Trying to get office data from %s", element)
+        office_name = element.select_one(".views-field-field-field-office-location")
+        if not office_name or not office_name.text.strip().endswith("ERO"):
+            logger.debug("  Skipping %s because it is not an ERO location", office_name.text if office_name else "unnamed element")
+            # not a field office
+            return {}
+        office["name"] = office_name.text.strip()
+        field_office = element.select_one(".views-field-title")
+        if field_office:
+            office["field_office"] = field_office.text.strip()
+        address = element.select_one(".address-line1")
+        if address:
+            office["address"]["street"] = address.text.strip()
+        # optional line 2 of address
+        address = element.select_one(".address-line2")
+        if address:
+            office["address"]["street"] = f"{office['address']['street']} {address.text.strip()}"
+        locality = element.select_one(".locality")
+        if locality:
+            office["address"]["locality"] = locality.text.strip()
+        administrative_area = element.select_one(".administrative-area")
+        if administrative_area:
+            office["address"]["administrative_area"] = administrative_area.text.strip()
+        postal_code = element.select_one(".postal-code")
+        if postal_code:
+            office["address"]["postal_code"] = postal_code.text.strip()
+        office["address_str"] = (
+            f"{office['address']['street']} {office['address']['locality']}, {office['address']['administrative_area']} {office['address']['postal_code']}"
+        )
+        country = element.select_one(".country")
+        if country:
+            office["address"]["country"] = country.text.strip()
+        phone = element.select_one(".ct-addr")
+        if phone:
+            office["phone"] = phone.text.strip()
+        details = element.select_one(".views-field-body")
+        email = details.findAll("a")  # type: ignore [union-attr]
+        if email:
+            office["email"] = email[0]["href"].split(":", 1)[1]
+        detail_txt = details.text  # type: ignore [union-attr]
+        logger.debug("Detail text: %s", detail_txt)
+        aor_match = re.match(r"Area of Responsibility:(.+)\n?Email", detail_txt)
+        if aor_match:
+            office["aor"] = aor_match.group(1).strip().replace("\xa0", " ")
+
+        logger.debug("Returning %s", office)
+        return office
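For context, a minimal usage sketch of the new class, mirroring how main.py wires it up below; the commented output shape is just field_offices_schema abbreviated:

    from field_offices import ICEFieldOfficeScraper

    # Walks the paginated listing (.../field-offices?page=N&exposed_form_display=1),
    # scrapes each page, and keys the results by the office title
    fo_scraper = ICEFieldOfficeScraper()
    field_offices = fo_scraper.scrape_field_offices()
    # {
    #     "field_offices": {"<office title>": {...field_office_schema...}, ...},
    #     "scraped_date": <UTC datetime>,
    #     "scrape_runtime": <seconds>,
    # }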

file_utils.py

Lines changed: 2 additions & 1 deletion
@@ -58,7 +58,8 @@ def print_summary(facilities_data: dict) -> None:
     # Count by field office
     field_offices: dict = {}
     for facility_id, facility in facilities_data["facilities"].items():
-        field_offices[facility["field_office"]] = field_offices.get(facility["field_office"], 0) + 1
+        office = facility.get("field_office", {}).get("field_office", "Unknown")
+        field_offices[office] = field_offices.get(office, 0) + 1

     logger.info("\nFacilities by Field Office:")
     for office, count in sorted(field_offices.items(), key=lambda x: x[1], reverse=True):
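Since facility_schema now embeds a full field_office_schema dict instead of a bare string (see schemas.py below), the summary has to reach one level deeper. A small illustration with made-up facility records:

    facilities_data = {
        "facilities": {
            "fac-1": {"field_office": {"field_office": "Detroit Field Office"}},
            "fac-2": {"field_office": {}},  # unmatched record falls back to "Unknown"
        }
    }

    field_offices: dict = {}
    for facility_id, facility in facilities_data["facilities"].items():
        office = facility.get("field_office", {}).get("field_office", "Unknown")
        field_offices[office] = field_offices.get(office, 0) + 1

    print(field_offices)  # {'Detroit Field Office': 1, 'Unknown': 1}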

main.py

Lines changed: 8 additions & 2 deletions
@@ -26,6 +26,7 @@
 from enricher import ExternalDataEnricher
 from schemas import supported_output_types
 from scraper import ICEGovFacilityScraper
+from field_offices import ICEFieldOfficeScraper
 from utils import logger
 # CLI, argument parsing, script orchestration

@@ -113,11 +114,16 @@ def main() -> None:
         exit(1)

     if args.scrape:
-        scraper = ICEGovFacilityScraper()
+        fo_scraper = ICEFieldOfficeScraper()
+        field_offices = fo_scraper.scrape_field_offices()
+        scraper = ICEGovFacilityScraper(field_offices)
         facilities_data = scraper.scrape_facilities()
     elif args.load_existing:
         facilities_data = copy.deepcopy(default_data.facilities_data)
-        logger.info("Loaded %s existing facilities from local data. (Not scraping)", len(facilities_data["facilities"]))
+        logger.info(
+            "Loaded %s existing facilities from local data. (Not scraping)",
+            len(facilities_data["facilities"].keys()),  # type: ignore [attr-defined]
+        )

     if args.enrich:
         if not facilities_data:
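scraper.py's side of this change is not rendered in this view, so exactly how ICEGovFacilityScraper consumes the field_offices argument isn't shown; presumably it attaches the matching scraped office record to each facility ("collect and match all field offices"). A hedged sketch of what that lookup might look like, with the function name and logic assumed:

    # Hypothetical sketch only -- the real matching code lives in scraper.py (not shown here)
    def match_field_office(office_title: str, field_offices: dict) -> dict:
        """Return the scraped office record keyed by the facility's listed field office."""
        return field_offices["field_offices"].get(office_title, {})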

schemas.py

Lines changed: 58 additions & 7 deletions
@@ -1,3 +1,4 @@
+import copy
 import datetime

 # default to Washington, D.C.?

@@ -13,6 +14,29 @@
     "facilities": {},
 }

+field_offices_schema: dict = {
+    "field_offices": {},
+    "scraped_date": datetime.datetime.now(datetime.UTC),
+    "scrape_runtime": 0,
+}
+
+field_office_schema: dict = {
+    "name": "",
+    "field_office": "",
+    "address_str": "",
+    "address": {
+        "administrative_area": "",
+        "country": "",
+        "locality": "",
+        "postal_code": "",
+        "street": "",
+    },
+    "aor": "",
+    "email": "",
+    "raw_scrape": "",
+    "source_urls": [],
+}
+
 # default keys to "false"-y values so we can merge easier
 facility_schema: dict = {
     "address": {

@@ -24,7 +48,7 @@
     },
     "address_str": "",
     "_repaired_record": False,
-    "field_office": "",
+    "field_office": copy.deepcopy(field_office_schema),
     "image_url": "",
     "name": "",
     "phone": "",

@@ -101,6 +125,7 @@
         "expanded_name": "United States Marshals Service",
         "description": "A facility primarily contracted with the USMS for housing of USMS detainees, in which ICE contracts with the USMS for bed space.",
     },
+    # two keys for the same thing as it isn't consistently defined
    "USMSIGA": {
         "expanded_name": "United States Marshal Service Intergovernmental Agreement",
         "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",

@@ -110,15 +135,43 @@
         "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
     },
     "USMS CDF": {
-        "expanded_name": "United States Marshal Service Central Detention Facility",
-        "description": "Name guessed at from searching",
+        "expanded_name": "United States Marshal Service Contract Detention Facility",
+        "description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
     },
     "CDF": {
-        "expanded_name": "Central Detention Facility",
-        "description": "Name guessed at from searching",
+        "expanded_name": "Contract Detention Facility",
+        "description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
     },
 }

+# ICE AOR mappings
+area_of_responsibility = {
+    "ATL": "Atlanta Field Office",
+    "BOS": "Boston Field Office",
+    "BUF": "Buffalo Field Office",
+    "CHI": "Chicago Field Office",
+    "DAL": "Dallas Field Office",
+    "DEN": "Denver Field Office",
+    "DET": "Detroit Field Office",
+    "ELP": "El Paso Field Office",
+    "HLG": "Harlingen Field Office",
+    "HOU": "Houston Field Office",
+    "LOS": "Los Angeles Field Office",
+    "MIA": "Miami Field Office",
+    "NEW": "Newark Field Office",
+    "NOL": "New Orleans Field Office",
+    "NYC": "New York City Field Office",
+    "PHI": "Philadelphia Field Office",
+    "PHO": "Phoenix Field Office",
+    "SEA": "Seattle Field Office",
+    "SFR": "San Francisco Field Office",
+    "SLC": "Salt Lake City Field Office",
+    "SNA": "San Antonio Field Office",
+    "SND": "San Diego Field Office",
+    "SPM": "St Paul Field Office",
+    "WAS": "Washington Field Office",
+}
+
 # enrichment response object
 resp_info_schema = {
     "original_name": "",

@@ -136,6 +189,4 @@
     "osm_found": 0,
 }

-default_field_office = "(Possibly) Not managed by DHS field office"
-
 supported_output_types = ["csv", "json", "xlsx", "parquet"]
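Taken together, the new pieces compose like this; a sketch with invented values (the aor text is an example, not scraped data):

    import copy

    office = copy.deepcopy(field_office_schema)
    office["field_office"] = area_of_responsibility["SPM"]  # "St Paul Field Office"
    office["aor"] = "Minnesota, North Dakota, South Dakota"  # invented example text

    # facility records now carry the whole office dict, not just a name string
    facility = copy.deepcopy(facility_schema)
    facility["field_office"] = office
    print(facility["field_office"]["field_office"])  # St Paul Field Office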
