 import polars
 import re
 from schemas import (
+    default_field_office,
     facilities_schema,
     facility_schema,
+    ice_facility_types,
 )
 import time
 from typing import Tuple
@@ -34,6 +36,7 @@ def __init__(self):
 
     def _download_sheet(self) -> None:
         resp = session.get(self.base_xlsx_url, timeout=120)
+        resp.raise_for_status()
         soup = BeautifulSoup(resp.content, "html.parser")
         links = soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))
         if not links:
@@ -260,9 +263,11 @@ def _load_sheet(self) -> dict:
                 details["population"]["male"]["allowed"] = True
 
             details["facility_type"] = row["Type Detailed"]
+            details["facility_type_detail"] = ice_facility_types.get(row["Type Detailed"], {})
             details["avg_stay_length"] = row["FY25 ALOS"]
             details["inspection_date"] = row["Last Inspection End Date"]
             details["source_urls"].append(self.sheet_url)
+            details["field_office"] = default_field_office
             results[full_address] = details
         return results
 
@@ -274,18 +279,32 @@ def _update_facility(self, old: dict, new: dict) -> dict:
                 old[k] = v
         return old
 
+    def _get_scrape_pages(self) -> list:
+        """Discover all facility pages"""
+        resp = session.get(self.base_scrape_url, timeout=30)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.content, "html.parser")
+        links = soup.findAll("a", href=re.compile(r"\?page="))
+        if not links:
+            raise Exception(f"{self.base_scrape_url} contains *no* links?!")
+        pages = [
+            f"{self.base_scrape_url}{link['href']}&exposed_form_display=1"
+            for link in links
+            if not any(k in link["aria-label"] for k in ["Next", "Last"])
+        ]
+        logger.debug("Pages discovered: %s", pages)
+        return pages
+
     def scrape_facilities(self):
         """Scrape all ICE detention facility data from all 6 pages"""
         start_time = time.time()
         logger.info("Starting to scrape ICE.gov detention facilities...")
         self.facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
         self.facilities_data["facilities"] = self._load_sheet()
-
-        # URLs for all pages
-        urls = [f"{self.base_scrape_url}?exposed_form_display=1&page={i}" for i in range(6)]
+        urls = self._get_scrape_pages()
 
         for page_num, url in enumerate(urls):
-            logger.info("Scraping page %s/6 ...", page_num + 1)
+            logger.info("Scraping page %s/%s ...", page_num + 1, len(urls))
             try:
                 facilities = self._scrape_page(url)
             except Exception as e:
@@ -308,6 +327,8 @@ def scrape_facilities(self):
                     self.facilities_data["facilities"][full_address] = self._update_facility(
                         self.facilities_data["facilities"][full_address], facility
                     )
+                    if facility["field_office"]:
+                        self.facilities_data["facilities"][full_address]["field_office"] = facility["field_office"]
                     # update to the frequently nicer address from ice.gov
                     self.facilities_data["facilities"][full_address]["address"] = addr
                     # add scraped urls
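
For context, the pager-discovery pattern that _get_scrape_pages() introduces above (replacing the hardcoded six-page loop) can be exercised on its own. A minimal sketch, assuming a Drupal-style listing page whose pager links carry "?page=" hrefs and aria-label attributes; the function name and listing_url are placeholders, not the scraper's actual attributes:

# Standalone sketch of the pager-discovery approach; names here are
# illustrative and not part of the scraper class.
import re

import requests
from bs4 import BeautifulSoup


def discover_pages(listing_url: str) -> list:
    resp = requests.get(listing_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    # Every pager link on the listing carries a "?page=N" href.
    links = soup.find_all("a", href=re.compile(r"\?page="))
    return [
        f"{listing_url}{link['href']}"
        for link in links
        # Drop the "Next"/"Last" shortcut links so each page is listed once.
        if not any(k in link.get("aria-label", "") for k in ("Next", "Last"))
    ]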