Commit 32c433c

Merge pull request #28 from johnseekins/dynamic-scrape-page-count
find pages to scrape rather than hard-coding
2 parents: ff94a0c + e867ac1

4 files changed: +34 −7 lines changed

README.md

Lines changed: 0 additions & 1 deletion
@@ -109,7 +109,6 @@ You can change this in scraper.py and enricher.py.
 seems wrong.
 * The remote query rate limiting is (I think) done in series but would go faster with parallel/async processing.
 * This is only targeted at English (EN) Wikipedia currently, but multi-lingual page checks would help a wider audience.
-* `uv run python main.py --load-existing` kicks errors. It doesn't know what to do.
 
 ## Contributing & Code Standards

enricher.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def enrich_facility_data(self, facilities_data: dict) -> dict:
         total = len(facilities_data["facilities"])
         processed = 0
 
-        for facility_id, facility in enumerate(facilities_data["facilities"]):
+        for facility_id, facility in facilities_data["facilities"].items():
             facility_name = facility["name"]
             logger.info("Processing facility %s/%s: %s...", processed + 1, total, facility_name)
             enriched_facility = copy.deepcopy(facility)
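For context on this fix: `facilities_data["facilities"]` is a dict keyed by address, and `enumerate()` over a dict walks its keys, so the old loop bound `facility` to the address string rather than the facility record. A minimal sketch with hypothetical data:

```python
# Hypothetical facilities dict, keyed by address (shape assumed from the diff above).
facilities = {
    "123 Main St, Anytown, TX": {"name": "Anytown Detention Center"},
    "456 Oak Ave, Somewhere, AZ": {"name": "Somewhere Processing Center"},
}

# Old behavior: enumerate() yields (index, key), so `facility` is the address string
# and facility["name"] would raise TypeError (string indices must be integers).
for facility_id, facility in enumerate(facilities):
    print(facility_id, type(facility))  # 0 <class 'str'> / 1 <class 'str'>

# New behavior: .items() yields (key, value), so `facility` is the facility record.
for facility_id, facility in facilities.items():
    print(facility_id, "->", facility["name"])
```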

schemas.py

Lines changed: 8 additions & 1 deletion
@@ -43,6 +43,7 @@
         },
     },
     "facility_type": "",
+    "facility_type_detail": {},
     "inspection_date": None,
     "avg_stay_length": 0,
 }
@@ -70,9 +71,13 @@
         "description": "A facility primarily contracted with the USMS for housing of USMS detainees, in which ICE contracts with the USMS for bed space.",
     },
     "USMSIGA": {
-        "expanded_name": "USMS Intergovernmental Agreement",
+        "expanded_name": "United States Marshal Service Intergovernmental Agreement",
         "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize an already established US Marshal Service contract.",
     },
+    "USMS CDF": {
+        "expanded_name": "United States Marshal Service Central Detention Facility",
+        "description": "Name guessed at from searching",
+    },
 }
 
 # enrichment response object
@@ -90,3 +95,5 @@
     "wikidata_found": 0,
     "osm_found": 0,
 }
+
+default_field_office = "(Possibly) Not managed by DHS field office"
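The new `facility_type_detail` field is populated in scraper.py (see the scraper.py hunk below) by looking the spreadsheet's type code up in `ice_facility_types`, with an empty dict as the fallback for unrecognized codes. A minimal sketch of that lookup, using a hypothetical row value and an abbreviated table:

```python
# Abbreviated ice_facility_types (the full table lives in schemas.py).
ice_facility_types = {
    "USMSIGA": {
        "expanded_name": "United States Marshal Service Intergovernmental Agreement",
        "description": "A USMS Intergovernmental Agreement in which ICE agrees to utilize "
        "an already established US Marshal Service contract.",
    },
}

details = {}
row_type = "USMSIGA"  # hypothetical value from the spreadsheet's "Type Detailed" column
details["facility_type"] = row_type
details["facility_type_detail"] = ice_facility_types.get(row_type, {})  # unknown codes fall back to {}
print(details["facility_type_detail"].get("expanded_name", "unknown"))
```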

scraper.py

Lines changed: 25 additions & 4 deletions
@@ -7,8 +7,10 @@
 import polars
 import re
 from schemas import (
+    default_field_office,
     facilities_schema,
     facility_schema,
+    ice_facility_types,
 )
 import time
 from typing import Tuple
@@ -34,6 +36,7 @@ def __init__(self):
 
     def _download_sheet(self) -> None:
         resp = session.get(self.base_xlsx_url, timeout=120)
+        resp.raise_for_status()
         soup = BeautifulSoup(resp.content, "html.parser")
         links = soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))
         if not links:
@@ -260,9 +263,11 @@ def _load_sheet(self) -> dict:
                 details["population"]["male"]["allowed"] = True
 
             details["facility_type"] = row["Type Detailed"]
+            details["facility_type_detail"] = ice_facility_types.get(row["Type Detailed"], {})
            details["avg_stay_length"] = row["FY25 ALOS"]
             details["inspection_date"] = row["Last Inspection End Date"]
             details["source_urls"].append(self.sheet_url)
+            details["field_office"] = default_field_office
             results[full_address] = details
         return results
 
@@ -274,18 +279,32 @@ def _update_facility(self, old: dict, new: dict) -> dict:
                 old[k] = v
         return old
 
+    def _get_scrape_pages(self) -> list:
+        """Discover all facility pages"""
+        resp = session.get(self.base_scrape_url, timeout=30)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.content, "html.parser")
+        links = soup.findAll("a", href=re.compile(r"\?page="))
+        if not links:
+            raise Exception(f"{self.base_scrape_url} contains *no* links?!")
+        pages = [
+            f"{self.base_scrape_url}{link['href']}&exposed_form_display=1"
+            for link in links
+            if not any(k in link["aria-label"] for k in ["Next", "Last"])
+        ]
+        logger.debug("Pages discovered: %s", pages)
+        return pages
+
     def scrape_facilities(self):
         """Scrape all ICE detention facility data from all 6 pages"""
         start_time = time.time()
         logger.info("Starting to scrape ICE.gov detention facilities...")
         self.facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
         self.facilities_data["facilities"] = self._load_sheet()
-
-        # URLs for all pages
-        urls = [f"{self.base_scrape_url}?exposed_form_display=1&page={i}" for i in range(6)]
+        urls = self._get_scrape_pages()
 
         for page_num, url in enumerate(urls):
-            logger.info("Scraping page %s/6...", page_num + 1)
+            logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
             try:
                 facilities = self._scrape_page(url)
             except Exception as e:
@@ -308,6 +327,8 @@ def scrape_facilities(self):
                 self.facilities_data["facilities"][full_address] = self._update_facility(
                     self.facilities_data["facilities"][full_address], facility
                 )
+                if facility["field_office"]:
+                    self.facilities_data["facilities"][full_address]["field_office"] = facility["field_office"]
                 # update to the frequently nicer address from ice.gov
                 self.facilities_data["facilities"][full_address]["address"] = addr
                 # add scraped urls
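To make the new discovery step easy to check in isolation, here is a self-contained sketch of the same pager-link filtering applied to a hypothetical HTML fragment; `base_scrape_url` and the markup are assumptions, not taken from the repository:

```python
# Sketch of the link discovery used by _get_scrape_pages(), run against static HTML.
import re
from bs4 import BeautifulSoup

base_scrape_url = "https://www.ice.gov/detention-facilities"  # assumed listing URL

html = """
<nav class="pager">
  <a href="?page=0" aria-label="Page 1">1</a>
  <a href="?page=1" aria-label="Page 2">2</a>
  <a href="?page=5" aria-label="Last page">Last</a>
  <a href="?page=1" aria-label="Next page">Next</a>
</nav>
"""

soup = BeautifulSoup(html, "html.parser")
links = soup.find_all("a", href=re.compile(r"\?page="))
# Keep numbered page links; drop the "Next"/"Last" shortcuts so pages aren't scraped twice.
pages = [
    f"{base_scrape_url}{link['href']}&exposed_form_display=1"
    for link in links
    if not any(k in link["aria-label"] for k in ["Next", "Last"])
]
print(pages)
# ['https://www.ice.gov/detention-facilities?page=0&exposed_form_display=1',
#  'https://www.ice.gov/detention-facilities?page=1&exposed_form_display=1']
```

Because the page count is now derived from whatever pagination links the listing page exposes, the scraper no longer breaks when ICE adds or removes a results page.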
