
Commit f151363

find pages to scrape rather than hard-coding
Signed-off-by: John Seekins <[email protected]>
1 parent 82b25d6 commit f151363

2 files changed: +21 −4 lines changed

enricher.py

Lines changed: 2 additions & 0 deletions

@@ -32,6 +32,8 @@ def enrich_facility_data(self, facilities_data: dict) -> dict:
             facility_name = facility["name"]
             logger.info("Processing facility %s/%s: %s...", processed + 1, total, facility_name)
             enriched_facility = copy.deepcopy(facility)
+            if not facility["field_office"]:
+                facility["field_office"] = "(Possibly) Not managed by DHS field office"
 
             # Wikipedia search # todo refactor to method
             try:

scraper.py

Lines changed: 19 additions & 4 deletions

@@ -34,6 +34,7 @@ def __init__(self):
 
     def _download_sheet(self) -> None:
         resp = session.get(self.base_xlsx_url, timeout=120)
+        resp.raise_for_status()
         soup = BeautifulSoup(resp.content, "html.parser")
         links = soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))
         if not links:
@@ -274,18 +275,32 @@ def _update_facility(self, old: dict, new: dict) -> dict:
                 old[k] = v
         return old
 
+    def _get_scrape_pages(self) -> list:
+        """Discover all facility pages"""
+        resp = session.get(self.base_scrape_url, timeout=30)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.content, "html.parser")
+        links = soup.findAll("a", href=re.compile(r"\?page="))
+        if not links:
+            raise Exception(f"{self.base_scrape_url} contains *no* links?!")
+        links = [
+            f"{self.base_scrape_url}{link['href']}&exposed_form_display=1"
+            for link in links
+            if not any(k in link["aria-label"] for k in ["Next", "Last"])
+        ]
+        logger.debug("Pages discovered: %s", links)
+        return links
+
     def scrape_facilities(self):
         """Scrape all ICE detention facility data from all 6 pages"""
         start_time = time.time()
         logger.info("Starting to scrape ICE.gov detention facilities...")
         self.facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
         self.facilities_data["facilities"] = self._load_sheet()
-
-        # URLs for all pages
-        urls = [f"{self.base_scrape_url}?exposed_form_display=1&page={i}" for i in range(6)]
+        urls = self._get_scrape_pages()
 
         for page_num, url in enumerate(urls):
-            logger.info("Scraping page %s/6...", page_num + 1)
+            logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
            try:
                facilities = self._scrape_page(url)
            except Exception as e:

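For reference, a minimal standalone sketch of the page-discovery idea this commit introduces: collect the "?page=" pagination links from the listing page instead of hard-coding a page count. The URL, the discover_pages name, and the __main__ usage are illustrative assumptions, not code from this repository.

# Standalone sketch (assumed URL and helper name; mirrors the approach of _get_scrape_pages).
import re

import requests
from bs4 import BeautifulSoup

BASE_SCRAPE_URL = "https://www.ice.gov/detention-facilities"  # assumed example URL


def discover_pages(base_url: str) -> list:
    """Collect pagination links, skipping the "Next"/"Last" shortcut links."""
    resp = requests.get(base_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    links = soup.find_all("a", href=re.compile(r"\?page="))
    return [
        f"{base_url}{link['href']}&exposed_form_display=1"
        for link in links
        if not any(k in link.get("aria-label", "") for k in ("Next", "Last"))
    ]


if __name__ == "__main__":
    for url in discover_pages(BASE_SCRAPE_URL):
        print(url)

Discovering the pages up front also lets the progress log report "page N of len(urls)" rather than a fixed "page N of 6", which is what the change to scrape_facilities above does.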
0 commit comments
