@@ -34,6 +34,7 @@ def __init__(self):
 
     def _download_sheet(self) -> None:
         resp = session.get(self.base_xlsx_url, timeout=120)
+        resp.raise_for_status()
         soup = BeautifulSoup(resp.content, "html.parser")
         links = soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))
         if not links:
@@ -274,18 +275,32 @@ def _update_facility(self, old: dict, new: dict) -> dict:
             old[k] = v
         return old
 
+    def _get_scrape_pages(self) -> list:
+        """Discover all facility pages"""
+        resp = session.get(self.base_scrape_url, timeout=30)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.content, "html.parser")
+        links = soup.findAll("a", href=re.compile(r"\?page="))
+        if not links:
+            raise Exception(f"{self.base_scrape_url} contains *no* links?!")
+        links = [
+            f"{self.base_scrape_url}{link['href']}&exposed_form_display=1"
+            for link in links
+            if not any(k in link["aria-label"] for k in ["Next", "Last"])
+        ]
+        logger.debug("Pages discovered: %s", links)
+        return links
+
     def scrape_facilities(self):
         """Scrape all ICE detention facility data from all 6 pages"""
         start_time = time.time()
         logger.info("Starting to scrape ICE.gov detention facilities...")
         self.facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
         self.facilities_data["facilities"] = self._load_sheet()
-
-        # URLs for all pages
-        urls = [f"{self.base_scrape_url}?exposed_form_display=1&page={i}" for i in range(6)]
+        urls = self._get_scrape_pages()
 
         for page_num, url in enumerate(urls):
-            logger.info("Scraping page %s/6 ...", page_num + 1)
+            logger.info("Scraping page %s/%s ...", page_num + 1, len(urls))
             try:
                 facilities = self._scrape_page(url)
             except Exception as e:
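
For reference, here is a minimal standalone sketch of the pagination-discovery pattern that the new _get_scrape_pages helper applies. The listing URL and the requests.Session setup below are assumptions for illustration, not taken from this commit:

import re
import requests
from bs4 import BeautifulSoup

session = requests.Session()
base_scrape_url = "https://www.ice.gov/detention-facilities"  # assumed listing URL

resp = session.get(base_scrape_url, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")

# Pager links carry hrefs like "?page=1"; skip the "Next"/"Last" arrow links so
# each numbered page is collected exactly once. Using .get() guards against
# anchors that have no aria-label attribute.
pages = [
    f"{base_scrape_url}{a['href']}&exposed_form_display=1"
    for a in soup.find_all("a", href=re.compile(r"\?page="))
    if not any(k in a.get("aria-label", "") for k in ["Next", "Last"])
]
print(pages)

Discovering the pages from the pager links means the scraper no longer depends on the hard-coded range(6) that the removed line assumed, so it keeps working if the number of results pages changes.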