@@ -24,22 +24,41 @@
 
 
 class ICEGovFacilityScraper(object):
-    base_url = "https://www.ice.gov/detention-facilities"
-    sheet_url = "https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx"
+    base_scrape_url = "https://www.ice.gov/detention-facilities"
+    base_xlsx_url = "https://www.ice.gov/detain/detention-management"
 
     # All methods for scraping ice.gov websites
     def __init__(self):
         self.facilities_data = copy.deepcopy(facilities_schema)
         self.filename = f"{SCRIPT_DIR}{os.sep}detentionstats.xlsx"
 
     def _download_sheet(self) -> None:
-        if not os.path.isfile(self.filename) or os.path.getsize(self.filename) < 1:
-            logger.info("Downloading sheet from %s", self.sheet_url)
-            resp = session.get(self.sheet_url, timeout=120)
-            with open(self.filename, "wb") as f:
-                for chunk in resp.iter_content(chunk_size=1024):
-                    if chunk:
-                        f.write(chunk)
+        resp = session.get(self.base_xlsx_url, timeout=120)
+        soup = BeautifulSoup(resp.content, "html.parser")
+        links = soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))
+        if not links:
+            raise Exception(f"Could not find any XLSX files on {self.base_xlsx_url}")
+        fy_re = re.compile(r".+FY(\d{2}).+")
+        actual_link = links[0]["href"]
+        cur_year = int(datetime.datetime.now().strftime("%y"))
+        # try to find the most recent
+        for link in links:
+            match = fy_re.match(link["href"])
+            if not match:
+                continue
+            year = int(match.group(1))
+            if year >= cur_year:
+                actual_link = link["href"]
+                cur_year = year
+
+        logger.debug("Found sheet at: %s", actual_link)
+        self.sheet_url = actual_link
+        logger.info("Downloading detention stats sheet from %s", self.sheet_url)
+        resp = session.get(self.sheet_url, timeout=120)
+        with open(self.filename, "wb") as f:
+            for chunk in resp.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
 
     def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
         """Generally, we'll let the spreadsheet win arguments just to be consistent"""
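For context, a minimal standalone sketch of what the fy_re pattern in the new _download_sheet extracts from a doclib href; the example below reuses the FY25 filename that the old hard-coded sheet_url pointed at, and the captured two-digit group is what gets compared against the current two-digit year:

    import re

    fy_re = re.compile(r".+FY(\d{2}).+")
    # The FY25 sheet URL that used to be hard-coded as sheet_url
    href = "https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx"
    match = fy_re.match(href)
    if match:
        print(int(match.group(1)))  # prints 25, the fiscal year used to pick the newest sheet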
@@ -263,7 +282,7 @@ def scrape_facilities(self): |
         self.facilities_data["facilities"] = self._load_sheet()
 
         # URLs for all pages
-        urls = [f"{self.base_url}?exposed_form_display=1&page={i}" for i in range(6)]
+        urls = [f"{self.base_scrape_url}?exposed_form_display=1&page={i}" for i in range(6)]
 
         for page_num, url in enumerate(urls):
             logger.info("Scraping page %s/6...", page_num + 1)