
Commit 06162c3

DRY and docs
Signed-off-by: John Seekins <[email protected]>
1 parent ef62cfe

File tree: 5 files changed (+92, -73 lines)


ice_scrapers/README.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+# ICE Facility scrapers
+
+These files maintain the code to collect (and collate) ICE facility data from a number of sources.
+
+## utils.py
+
+Contains most of our collating functions and shared helpers that scrapers may need.
+
+## __init__.py
+
+Contains some static objects and import declarations (so we can `from ice_scrapers import` successfully)...
+
+## spreadsheet_load.py
+
+ICE is required by law to produce regular custody data, which we pull from `https://www.ice.gov/detain/detention-management`. Because this spreadsheet is more "complete" than other sources we've found, we use it as our base scrape.
+
+## facilities_scraper.py
+
+Pulls information about ICE detention facilities from `https://www.ice.gov/detention-facilities`. This adds additional (or corrected) data about facility locations and contact information, and provides facility images.
+
+## field_offices.py
+
+Collects additional data about ICE/DHS field offices from `https://www.ice.gov/contact/field-offices`: largely basic areas of responsibility and contact info for each field office.
+
+> The field-offices page shows information about a number of different offices. As we are largely focused on detention, ERO (Enforcement and Removal Operations) centers are the most interesting.
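
For orientation, here is a rough sketch of how the modules described in this README might be chained together. This is not part of the commit; `load_sheet`'s exact signature and the import path for `scrape_field_offices` are assumptions inferred from this diff, not verified against the full source.

```python
# Hypothetical end-to-end sketch (not part of this commit).
# Assumption: load_sheet() builds and returns the base facilities dict.
from ice_scrapers import (
    load_sheet,
    merge_field_offices,
    scrape_facilities,
)
from ice_scrapers.field_offices import scrape_field_offices  # assumed import path

facilities_data = load_sheet()                          # base scrape: custody spreadsheet
facilities_data = scrape_facilities(facilities_data)    # enrich from detention-facilities pages
field_offices = scrape_field_offices()                  # ERO field office details
facilities_data = merge_field_offices(facilities_data, field_offices)
```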

ice_scrapers/__init__.py

Lines changed: 32 additions & 1 deletion
@@ -77,13 +77,44 @@
     },
 }
 
+# ICE AOR mappings
+area_of_responsibility = {
+    "ATL": "Atlanta Field Office",
+    "BAL": "Baltimore Field Office",
+    "BOS": "Boston Field Office",
+    "BUF": "Buffalo Field Office",
+    "CHI": "Chicago Field Office",
+    "DAL": "Dallas Field Office",
+    "DEN": "Denver Field Office",
+    "DET": "Detroit Field Office",
+    "ELP": "El Paso Field Office",
+    "HLG": "Harlingen Field Office",
+    "HOU": "Houston Field Office",
+    "LOS": "Los Angeles Field Office",
+    "MIA": "Miami Field Office",
+    "NEW": "Newark Field Office",
+    "NOL": "New Orleans Field Office",
+    "NYC": "New York City Field Office",
+    "PHI": "Philadelphia Field Office",
+    "PHO": "Phoenix Field Office",
+    "SEA": "Seattle Field Office",
+    "SFR": "San Francisco Field Office",
+    "SLC": "Salt Lake City Field Office",
+    "SNA": "San Antonio Field Office",
+    "SND": "San Diego Field Office",
+    "SPM": "St Paul Field Office",
+    "WAS": "Washington Field Office",
+}
+field_office_to_aor = {v: k for k, v in area_of_responsibility.items()}
+
 from .utils import (  # noqa: E402
     clean_street,  # noqa: F401
+    get_ice_scrape_pages,  # noqa: F401
     repair_zip,  # noqa: F401
     repair_locality,  # noqa: F401
     update_facility,  # noqa: F401
 )
-from .page_load import scrape_facilities  # noqa: F401,E402
+from .facilities_scraper import scrape_facilities  # noqa: F401,E402
 from .spreadsheet_load import load_sheet  # noqa: F401,E402
 from .field_offices import (  # noqa: E402
     merge_field_offices,  # noqa: F401
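
Since the AOR codes and office names now live in `__init__.py`, both lookup directions are importable by any scraper. A quick illustration, using values from the mapping above:

```python
from ice_scrapers import area_of_responsibility, field_office_to_aor

# Code -> office name, and the reverse mapping built above.
assert area_of_responsibility["DEN"] == "Denver Field Office"
assert field_office_to_aor["Denver Field Office"] == "DEN"
```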

ice_scrapers/page_load.py renamed to ice_scrapers/facilities_scraper.py

Lines changed: 2 additions & 19 deletions
@@ -1,10 +1,10 @@
-# scraping-related code for ice.gov detention facility pages
 from bs4 import BeautifulSoup
 import copy
 import datetime
 import re
 from ice_scrapers import (
     clean_street,
+    get_ice_scrape_pages,
     repair_zip,
     repair_locality,
     update_facility,
@@ -21,29 +21,12 @@
 base_scrape_url = "https://www.ice.gov/detention-facilities"
 
 
-def _get_scrape_pages() -> list:
-    """Discover all facility pages"""
-    resp = session.get(base_scrape_url, timeout=30)
-    resp.raise_for_status()
-    soup = BeautifulSoup(resp.content, "html.parser")
-    links = soup.findAll("a", href=re.compile(r"\?page="))
-    if not links:
-        raise Exception(f"{base_scrape_url} contains *no* links?!")
-    pages = [
-        f"{base_scrape_url}{link['href']}&exposed_form_display=1"
-        for link in links
-        if not any(k in link["aria-label"] for k in ["Next", "Last"])
-    ]
-    logger.debug("Pages discovered: %s", pages)
-    return pages
-
-
 def scrape_facilities(facilities_data: dict) -> dict:
     """Scrape all ICE detention facility data from all discovered pages"""
     start_time = time.time()
     logger.info("Starting to scrape ICE.gov detention facilities...")
     facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
-    urls = _get_scrape_pages()
+    urls = get_ice_scrape_pages(base_scrape_url)
 
     for page_num, url in enumerate(urls):
         logger.info("Scraping page %s/%s...", page_num + 1, len(urls))

ice_scrapers/field_offices.py

Lines changed: 7 additions & 53 deletions
@@ -2,6 +2,11 @@
 from bs4 import BeautifulSoup
 import copy
 import datetime
+from ice_scrapers import (
+    area_of_responsibility,
+    field_office_to_aor,
+    get_ice_scrape_pages,
+)
 import re
 from schemas import (
     field_offices_schema,
@@ -15,61 +20,14 @@
 
 base_scrape_url = "https://www.ice.gov/contact/field-offices"
 
-# ICE AOR mappings
-area_of_responsibility = {
-    "ATL": "Atlanta Field Office",
-    "BAL": "Baltimore Field Office",
-    "BOS": "Boston Field Office",
-    "BUF": "Buffalo Field Office",
-    "CHI": "Chicago Field Office",
-    "DAL": "Dallas Field Office",
-    "DEN": "Denver Field Office",
-    "DET": "Detroit Field Office",
-    "ELP": "El Paso Field Office",
-    "HLG": "Harlingen Field Office",
-    "HOU": "Houston Field Office",
-    "LOS": "Los Angeles Field Office",
-    "MIA": "Miami Field Office",
-    "NEW": "Newark Field Office",
-    "NOL": "New Orleans Field Office",
-    "NYC": "New York City Field Office",
-    "PHI": "Philadelphia Field Office",
-    "PHO": "Phoenix Field Office",
-    "SEA": "Seattle Field Office",
-    "SFR": "San Francisco Field Office",
-    "SLC": "Salt Lake City Field Office",
-    "SNA": "San Antonio Field Office",
-    "SND": "San Diego Field Office",
-    "SPM": "St Paul Field Office",
-    "WAS": "Washington Field Office",
-}
-field_office_to_aor = {v: k for k, v in area_of_responsibility.items()}
-
-
-def _get_scrape_pages() -> list:
-    """Discover all facility pages"""
-    resp = session.get(base_scrape_url, timeout=30)
-    resp.raise_for_status()
-    soup = BeautifulSoup(resp.content, "html.parser")
-    links = soup.findAll("a", href=re.compile(r"\?page="))
-    if not links:
-        raise Exception(f"{base_scrape_url} contains *no* links?!")
-    pages = [
-        f"{base_scrape_url}{link['href']}&exposed_form_display=1"
-        for link in links
-        if not any(k in link["aria-label"] for k in ["Next", "Last"])
-    ]
-    logger.debug("Pages discovered: %s", pages)
-    return pages
-
 
 def scrape_field_offices() -> dict:
     """Collect data on ICE field offices"""
     start_time = time.time()
     office_data = copy.deepcopy(field_offices_schema)
     office_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
     logger.info("Starting to scrape ICE.gov field offices...")
-    urls = _get_scrape_pages()
+    urls = get_ice_scrape_pages(base_scrape_url)
     for page_num, url in enumerate(urls):
         logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
         offices = _scrape_page(url)
@@ -138,11 +96,6 @@ def _scrape_page(page_url: str) -> list:
             )
             break
 
-    # if not office_elements:
-    #     # Fallback: look for any element containing office-like text patterns
-    #     logger.warning("  Using fallback: searching for office patterns in text")
-    #     office_elements = _find_office_patterns(content_container)
-
     # Extract data from each office element
     for element in office_elements:
         office_data = _extract_single_office(element, page_url)
@@ -208,6 +161,7 @@ def _extract_single_office(element: BeautifulSoup, page_url: str) -> dict:
 
 
 def merge_field_offices(facilities_data: dict, field_offices: dict) -> dict:
+    """Actually insert field office data into our facilities_data object"""
     final_facilities = copy.deepcopy(facilities_data["facilities"])
     for facility_id, facility in facilities_data["facilities"].items():
         office = field_offices["field_offices"].get(facility["field_office"]["field_office"], None)

ice_scrapers/utils.py

Lines changed: 26 additions & 0 deletions
@@ -1,4 +1,10 @@
+from bs4 import BeautifulSoup
+import re
 from typing import Tuple
+from utils import (
+    logger,
+    session,
+)
 
 
 def clean_street(street: str, locality: str = "") -> Tuple[str, bool]:
@@ -179,3 +185,23 @@ def update_facility(old: dict, new: dict) -> dict:
         if not old.get(k, None):
             old[k] = v
     return old
+
+
+def get_ice_scrape_pages(url: str) -> list:
+    """
+    Discover all facility pages
+    This _may_ be generic to Drupal's pagination code...
+    """
+    resp = session.get(url, timeout=30)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.content, "html.parser")
+    links = soup.findAll("a", href=re.compile(r"\?page="))
+    if not links:
+        raise Exception(f"{url} contains *no* links?!")
+    pages = [
+        f"{url}{link['href']}&exposed_form_display=1"
+        for link in links
+        if not any(k in link["aria-label"] for k in ["Next", "Last"])
+    ]
+    logger.debug("Pages discovered: %s", pages)
+    return pages
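
With pagination discovery hoisted into `utils.py`, any scraper that targets a paginated ice.gov listing can share it. A minimal sketch of the call pattern used by both `facilities_scraper.py` and `field_offices.py`, here reusing the field-offices URL shown earlier:

```python
from ice_scrapers import get_ice_scrape_pages

# Discover every paginated listing URL, then iterate them as the scrapers do.
base_scrape_url = "https://www.ice.gov/contact/field-offices"
for page_num, url in enumerate(get_ice_scrape_pages(base_scrape_url)):
    print(f"page {page_num + 1}: {url}")
```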
